Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
11#define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
12#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
13#include "Tpetra_RowMatrixTransposer.hpp"
14
15namespace Tpetra {
16
17namespace MatrixMatrix {
18
19namespace ExtraKernels {
20
21// Double product version
22template <class CrsMatrixType>
23size_t C_estimate_nnz_per_row(CrsMatrixType& A, CrsMatrixType& B) {
24 // Follows the NZ estimate in ML's ml_matmatmult.c
25 size_t Aest = 100, Best = 100;
26 if (A.getLocalNumEntries() > 0)
27 Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 100;
28 if (B.getLocalNumEntries() > 0)
29 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries() / B.getLocalNumRows() : 100;
30
31 size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
32 nnzperrow *= nnzperrow;
33
34 return nnzperrow;
35}
36
37// Triple product version
38template <class CrsMatrixType>
39size_t Ac_estimate_nnz(CrsMatrixType& A, CrsMatrixType& P) {
40 size_t nnzPerRowA = 100, Pcols = 100;
41 if (A.getLocalNumEntries() > 0)
42 nnzPerRowA = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 9;
43 if (P.getLocalNumEntries() > 0)
44 Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
45 return (size_t)(Pcols * nnzPerRowA + 5 * nnzPerRowA + 300);
46}
47
48#if defined(HAVE_TPETRA_INST_OPENMP)
49/*********************************************************************************************************/
50template <class Scalar,
51 class LocalOrdinal,
52 class GlobalOrdinal,
53 class LocalOrdinalViewType>
54void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
55 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
56 const LocalOrdinalViewType& targetMapToOrigRow,
57 const LocalOrdinalViewType& targetMapToImportRow,
58 const LocalOrdinalViewType& Bcol2Ccol,
59 const LocalOrdinalViewType& Icol2Ccol,
60 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
61 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
62 const std::string& label,
63 const Teuchos::RCP<Teuchos::ParameterList>& params) {
64#ifdef HAVE_TPETRA_MMM_TIMINGS
65 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
66 using Teuchos::TimeMonitor;
67 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix LTGCore"))));
68#endif
69 using Teuchos::Array;
70 using Teuchos::ArrayRCP;
71 using Teuchos::ArrayView;
72 using Teuchos::RCP;
73 using Teuchos::rcp;
74
75 // Lots and lots of typedefs
77 typedef typename KCRS::device_type device_t;
78 typedef typename KCRS::StaticCrsGraphType graph_t;
79 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
80 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
81 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
82 typedef typename KCRS::values_type::non_const_type scalar_view_t;
83
84 // Unmanaged versions of the above
85 // typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
86 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
87 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
88
89 typedef Scalar SC;
90 typedef LocalOrdinal LO;
91 typedef GlobalOrdinal GO;
92 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
93 typedef Map<LO, GO, NO> map_type;
94
95 // NOTE (mfh 15 Sep 2017) This is specifically only for
96 // execution_space = Kokkos::OpenMP, so we neither need nor want
97 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
98 typedef NO::execution_space execution_space;
99 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
100
101 // All of the invalid guys
102 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
103 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
104 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
105
106 bool skipExplicitZero = true;
107 if (params && params->isParameter("MM Skip Explicit Zeros")) {
108 skipExplicitZero = params->get<bool>("MM Skip Explicit Zeros");
109 }
110
111 // Grab the Kokkos::SparseCrsMatrices & inner stuff
112 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
113 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
114
115 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
116 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
117 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
118 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
119
120 c_lno_view_t Irowptr;
121 lno_nnz_view_t Icolind;
122 scalar_view_t Ivals;
123 if (!Bview.importMatrix.is_null()) {
124 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
125 Irowptr = lclB.graph.row_map;
126 Icolind = lclB.graph.entries;
127 Ivals = lclB.values;
128 b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
129 }
130
131 // Sizes
132 RCP<const map_type> Ccolmap = C.getColMap();
133 size_t m = Aview.origMatrix->getLocalNumRows();
134 size_t n = Ccolmap->getLocalNumElements();
135 size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
136
137 // Get my node / thread info (right from openmp or parameter list)
138 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
139 if (!params.is_null()) {
140 if (params->isParameter("openmp: ltg thread max"))
141 thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
142 }
143
144 // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
145 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
146 // we will not touch these until the final copyout step
147 lno_nnz_view_t entriesC;
148 scalar_view_t valuesC;
149
150 // add this, since we do the rowptr in place, we could figure this out
151 // using the rowptr, but it requires an unusual loop (that jumps all over memory)
152 lno_nnz_view_t thread_total_nnz("thread_total_nnz", thread_max + 1);
153
154 // Thread-local memory
155 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
156 Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);
157
158 double thread_chunk = (double)(m) / thread_max;
159
160 // Run chunks of the matrix independently
161 Kokkos::parallel_for("MMM::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
162 // Thread coordination stuff
163 size_t my_thread_start = tid * thread_chunk;
164 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
165 size_t my_thread_m = my_thread_stop - my_thread_start;
166
167 // Size estimate
168 size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
169
170 // Allocations
171 std::vector<size_t> c_status(n, INVALID);
172
173 u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
174 u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
175
176 // For each row of A/C
177 size_t CSR_ip = 0, OLD_ip = 0;
178 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
179 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
180 // on the calling process.
181 // JJE 10 Apr 2019 index directly into the rowptr
182 row_mapC(i) = CSR_ip;
183
184 // mfh 27 Sep 2016: For each entry of A in the current row of A
185 for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
186 LO Aik = Acolind(k); // local column index of current entry of A
187 const SC Aval = Avals(k); // value of current entry of A
188 if (Aval == SC_ZERO && skipExplicitZero)
189 continue; // skip explicitly stored zero values in A
190
191 if (targetMapToOrigRow(Aik) != LO_INVALID) {
192 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
193 // corresponding to the current entry of A is populated, then
194 // the corresponding row of B is in B_local (i.e., it lives on
195 // the calling process).
196
197 // Local matrix
198 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
199
200 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
201 for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
202 LO Bkj = Bcolind(j);
203 LO Cij = Bcol2Ccol(Bkj);
204
205 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
206 // New entry
207 c_status[Cij] = CSR_ip;
208 Ccolind(CSR_ip) = Cij;
209 Cvals(CSR_ip) = Aval * Bvals(j);
210 CSR_ip++;
211
212 } else {
213 Cvals(c_status[Cij]) += Aval * Bvals(j);
214 }
215 }
216
217 } else {
218 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
219 // corresponding to the current entry of A NOT populated (has
220 // a flag "invalid" value), then the corresponding row of B is
221 // in B_local (i.e., it lives on the calling process).
222
223 // Remote matrix
224 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
225 for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
226 LO Ikj = Icolind(j);
227 LO Cij = Icol2Ccol(Ikj);
228
229 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
230 // New entry
231 c_status[Cij] = CSR_ip;
232 Ccolind(CSR_ip) = Cij;
233 Cvals(CSR_ip) = Aval * Ivals(j);
234 CSR_ip++;
235
236 } else {
237 Cvals(c_status[Cij]) += Aval * Ivals(j);
238 }
239 }
240 }
241 }
242
243 // Resize for next pass if needed
244 if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1)) * b_max_nnz_per_row) > CSR_alloc) {
245 CSR_alloc *= 2;
246 Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
247 Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
248 }
249 OLD_ip = CSR_ip;
250 }
251 thread_total_nnz(tid) = CSR_ip;
252 tl_colind(tid) = Ccolind;
253 tl_values(tid) = Cvals;
254 });
255
256 // Do the copy out
257 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
258
259#ifdef HAVE_TPETRA_MMM_TIMINGS
260 MM = Teuchos::null;
261 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPSort"))));
262#endif
263 // Sort & set values
264 if (params.is_null() || params->get("sort entries", true)) {
265 // Tpetra's SpGEMM results in almost sorted matrices. Use shell sort.
266 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
267 }
268 C.setAllValues(row_mapC, entriesC, valuesC);
269}
270
271/*********************************************************************************************************/
272template <class Scalar,
273 class LocalOrdinal,
274 class GlobalOrdinal,
275 class LocalOrdinalViewType>
276void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
277 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
278 const LocalOrdinalViewType& targetMapToOrigRow,
279 const LocalOrdinalViewType& targetMapToImportRow,
280 const LocalOrdinalViewType& Bcol2Ccol,
281 const LocalOrdinalViewType& Icol2Ccol,
282 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
283 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
284 const std::string& label,
285 const Teuchos::RCP<Teuchos::ParameterList>& params) {
286 // By default, if A*B results in an entry that is not supported by the graph of C, we throw.
287 // This option allows to override this behavior and silently ignores such entries.
288 bool throwOnInsert = true;
289 if (!params.is_null() && params->isType<bool>("MM Throw For Non-Existent Entries"))
290 throwOnInsert = params->get<bool>("MM Throw For Non-Existent Entries");
291
292#ifdef HAVE_TPETRA_MMM_TIMINGS
293 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
294 using Teuchos::TimeMonitor;
295 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse LTGCore"))));
296#endif
297
298 using Teuchos::Array;
299 using Teuchos::ArrayRCP;
300 using Teuchos::ArrayView;
301 using Teuchos::RCP;
302 using Teuchos::rcp;
303
304 // Lots and lots of typedefs
306 // typedef typename KCRS::device_type device_t;
307 typedef typename KCRS::StaticCrsGraphType graph_t;
308 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
309 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
310 typedef typename KCRS::values_type::non_const_type scalar_view_t;
311
312 typedef Scalar SC;
313 typedef LocalOrdinal LO;
314 typedef GlobalOrdinal GO;
315 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
316 typedef Map<LO, GO, NO> map_type;
317
318 // NOTE (mfh 15 Sep 2017) This is specifically only for
319 // execution_space = Kokkos::OpenMP, so we neither need nor want
320 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
321 typedef NO::execution_space execution_space;
322 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
323
324 // All of the invalid guys
325 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
326 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
327 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
328
329 // Grab the Kokkos::SparseCrsMatrices & inner stuff
330 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
331 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
332 const KCRS Cmat = C.getLocalMatrixDevice();
333
334 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
335 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
336 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
337 scalar_view_t Cvals = Cmat.values;
338
339 c_lno_view_t Irowptr;
340 c_lno_nnz_view_t Icolind;
341 scalar_view_t Ivals;
342 if (!Bview.importMatrix.is_null()) {
343 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
344 Irowptr = lclB.graph.row_map;
345 Icolind = lclB.graph.entries;
346 Ivals = lclB.values;
347 }
348
349 // Sizes
350 RCP<const map_type> Ccolmap = C.getColMap();
351 size_t m = Aview.origMatrix->getLocalNumRows();
352 size_t n = Ccolmap->getLocalNumElements();
353
354 // Get my node / thread info (right from openmp or parameter list)
355 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
356 if (!params.is_null()) {
357 if (params->isParameter("openmp: ltg thread max"))
358 thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
359 }
360
361 double thread_chunk = (double)(m) / thread_max;
362
363 // Run chunks of the matrix independently
364 Kokkos::parallel_for("MMM::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
365 // Thread coordination stuff
366 size_t my_thread_start = tid * thread_chunk;
367 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
368
369 // Allocations
370 std::vector<size_t> c_status(n, INVALID);
371
372 // For each row of A/C
373 size_t CSR_ip = 0, OLD_ip = 0;
374 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
375 // First fill the c_status array w/ locations where we're allowed to
376 // generate nonzeros for this row
377 OLD_ip = Crowptr(i);
378 CSR_ip = Crowptr(i + 1);
379 for (size_t k = OLD_ip; k < CSR_ip; k++) {
380 c_status[Ccolind(k)] = k;
381 // Reset values in the row of C
382 Cvals(k) = SC_ZERO;
383 }
384
385 for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
386 LO Aik = Acolind(k);
387 const SC Aval = Avals(k);
388 if (Aval == SC_ZERO)
389 continue;
390
391 if (targetMapToOrigRow(Aik) != LO_INVALID) {
392 // Local matrix
393 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
394
395 for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
396 LO Bkj = Bcolind(j);
397 LO Cij = Bcol2Ccol(Bkj);
398
399 const bool badInsert = c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip;
400 if (!badInsert)
401 Cvals(c_status[Cij]) += Aval * Bvals(j);
402 else if (throwOnInsert)
403 TEUCHOS_TEST_FOR_EXCEPTION(badInsert,
404 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
405 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
406 }
407 } else {
408 // Remote matrix
409 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
410 for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
411 LO Ikj = Icolind(j);
412 LO Cij = Icol2Ccol(Ikj);
413
414 const bool badInsert = c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip;
415 if (!badInsert)
416 Cvals(c_status[Cij]) += Aval * Ivals(j);
417 else if (throwOnInsert)
418 TEUCHOS_TEST_FOR_EXCEPTION(badInsert,
419 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
420 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
421 }
422 }
423 }
424 }
425 });
426
427 // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
428}
429
430/*********************************************************************************************************/
431template <class Scalar,
432 class LocalOrdinal,
433 class GlobalOrdinal,
434 class LocalOrdinalViewType>
435void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
436 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
437 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
438 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
439 const LocalOrdinalViewType& targetMapToOrigRow,
440 const LocalOrdinalViewType& targetMapToImportRow,
441 const LocalOrdinalViewType& Bcol2Ccol,
442 const LocalOrdinalViewType& Icol2Ccol,
443 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
444 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
445 const std::string& label,
446 const Teuchos::RCP<Teuchos::ParameterList>& params) {
447#ifdef HAVE_TPETRA_MMM_TIMINGS
448 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
449 using Teuchos::TimeMonitor;
450 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix LTGCore"))));
451#endif
452
453 using Teuchos::Array;
454 using Teuchos::ArrayRCP;
455 using Teuchos::ArrayView;
456 using Teuchos::RCP;
457 using Teuchos::rcp;
458
459 // Lots and lots of typedefs
460 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
462 typedef typename KCRS::device_type device_t;
463 typedef typename KCRS::StaticCrsGraphType graph_t;
464 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
465 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
466 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
467 typedef typename KCRS::values_type::non_const_type scalar_view_t;
468
469 // Unmanaged versions of the above
470 // typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
471 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
472 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
473
474 // Jacobi-specific
475 typedef typename scalar_view_t::memory_space scalar_memory_space;
476
477 typedef Scalar SC;
478 typedef LocalOrdinal LO;
479 typedef GlobalOrdinal GO;
480 typedef Node NO;
481 typedef Map<LO, GO, NO> map_type;
482
483 // NOTE (mfh 15 Sep 2017) This is specifically only for
484 // execution_space = Kokkos::OpenMP, so we neither need nor want
485 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
486 typedef NO::execution_space execution_space;
487 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
488
489 // All of the invalid guys
490 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
491 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
492 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
493
494 // Grab the Kokkos::SparseCrsMatrices & inner stuff
495 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
496 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
497
498 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
499 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
500 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
501 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
502
503 c_lno_view_t Irowptr;
504 lno_nnz_view_t Icolind;
505 scalar_view_t Ivals;
506 if (!Bview.importMatrix.is_null()) {
507 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
508 Irowptr = lclB.graph.row_map;
509 Icolind = lclB.graph.entries;
510 Ivals = lclB.values;
511 b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
512 }
513
514 // Jacobi-specific inner stuff
515 auto Dvals =
516 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
517
518 // Sizes
519 RCP<const map_type> Ccolmap = C.getColMap();
520 size_t m = Aview.origMatrix->getLocalNumRows();
521 size_t n = Ccolmap->getLocalNumElements();
522 size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
523
524 // Get my node / thread info (right from openmp)
525 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
526 if (!params.is_null()) {
527 if (params->isParameter("openmp: ltg thread max"))
528 thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
529 }
530
531 // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
532 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
533 // we will not touch these until the final copyout step
534 lno_nnz_view_t entriesC;
535 scalar_view_t valuesC;
536
537 // add this, since we do the rowptr in place, we could figure this out
538 // using the rowptr, but it requires an unusual loop (that jumps all over memory)
539 lno_nnz_view_t thread_total_nnz("thread_total_nnz", thread_max + 1);
540
541 // Thread-local memory
542 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
543 Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);
544
545 double thread_chunk = (double)(m) / thread_max;
546
547 // Run chunks of the matrix independently
548 Kokkos::parallel_for("Jacobi::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
549 // Thread coordination stuff
550 size_t my_thread_start = tid * thread_chunk;
551 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
552 size_t my_thread_m = my_thread_stop - my_thread_start;
553
554 // Size estimate
555 size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
556
557 // Allocations
558 std::vector<size_t> c_status(n, INVALID);
559
560 u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
561 u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
562
563 // For each row of A/C
564 size_t CSR_ip = 0, OLD_ip = 0;
565 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
566 // printf("CMS: row %d CSR_alloc = %d\n",(int)i,(int)CSR_alloc);fflush(stdout);
567 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
568 // on the calling process.
569 // JJE: directly access the rowptr here (indexed using our loop var)
570 row_mapC(i) = CSR_ip;
571 // NOTE: Vector::getLocalView returns a rank 2 view here
572 SC minusOmegaDval = -omega * Dvals(i, 0);
573
574 // Entries of B
575 for (size_t j = Browptr(i); j < Browptr(i + 1); j++) {
576 const SC Bval = Bvals(j);
577 if (Bval == SC_ZERO)
578 continue;
579 LO Bij = Bcolind(j);
580 LO Cij = Bcol2Ccol(Bij);
581
582 // Assume no repeated entries in B
583 c_status[Cij] = CSR_ip;
584 Ccolind(CSR_ip) = Cij;
585 Cvals(CSR_ip) = Bvals[j];
586 CSR_ip++;
587 }
588
589 // Entries of -omega * Dinv * A * B
590 // mfh 27 Sep 2016: For each entry of A in the current row of A
591 for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
592 LO Aik = Acolind(k); // local column index of current entry of A
593 const SC Aval = Avals(k); // value of current entry of A
594 if (Aval == SC_ZERO)
595 continue; // skip explicitly stored zero values in A
596
597 if (targetMapToOrigRow(Aik) != LO_INVALID) {
598 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
599 // corresponding to the current entry of A is populated, then
600 // the corresponding row of B is in B_local (i.e., it lives on
601 // the calling process).
602
603 // Local matrix
604 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
605
606 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
607 for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
608 LO Bkj = Bcolind(j);
609 LO Cij = Bcol2Ccol(Bkj);
610
611 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
612 // New entry
613 c_status[Cij] = CSR_ip;
614 Ccolind(CSR_ip) = Cij;
615 Cvals(CSR_ip) = minusOmegaDval * Aval * Bvals(j);
616 CSR_ip++;
617
618 } else {
619 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
620 }
621 }
622
623 } else {
624 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
625 // corresponding to the current entry of A NOT populated (has
626 // a flag "invalid" value), then the corresponding row of B is
627 // in B_local (i.e., it lives on the calling process).
628
629 // Remote matrix
630 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
631 for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
632 LO Ikj = Icolind(j);
633 LO Cij = Icol2Ccol(Ikj);
634
635 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
636 // New entry
637 c_status[Cij] = CSR_ip;
638 Ccolind(CSR_ip) = Cij;
639 Cvals(CSR_ip) = minusOmegaDval * Aval * Ivals(j);
640 CSR_ip++;
641
642 } else {
643 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
644 }
645 }
646 }
647 }
648
649 // Resize for next pass if needed
650 if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1) + 1) * b_max_nnz_per_row) > CSR_alloc) {
651 CSR_alloc *= 2;
652 Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
653 Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
654 }
655 OLD_ip = CSR_ip;
656 }
657 thread_total_nnz(tid) = CSR_ip;
658 tl_colind(tid) = Ccolind;
659 tl_values(tid) = Cvals;
660 });
661
662 // Do the copy out (removed the tl_rowptr!)
663 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
664
665#ifdef HAVE_TPETRA_MMM_TIMINGS
666 MM = Teuchos::null;
667 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix OpenMPSort"))));
668#endif
669 // Sort & set values
670 if (params.is_null() || params->get("sort entries", true)) {
671 // Tpetra's SpGEMM results in almost sorted matrices. Use shell sort.
672 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
673 }
674 C.setAllValues(row_mapC, entriesC, valuesC);
675}
676
677/*********************************************************************************************************/
678template <class Scalar,
679 class LocalOrdinal,
680 class GlobalOrdinal,
681 class LocalOrdinalViewType>
682void jacobi_A_B_reuse_LowThreadGustavsonKernel(typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
683 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
684 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
685 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
686 const LocalOrdinalViewType& targetMapToOrigRow,
687 const LocalOrdinalViewType& targetMapToImportRow,
688 const LocalOrdinalViewType& Bcol2Ccol,
689 const LocalOrdinalViewType& Icol2Ccol,
690 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
691 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
692 const std::string& label,
693 const Teuchos::RCP<Teuchos::ParameterList>& params) {
694#ifdef HAVE_TPETRA_MMM_TIMINGS
695 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
696 using Teuchos::TimeMonitor;
697 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse LTGCore"))));
698#endif
699 using Teuchos::Array;
700 using Teuchos::ArrayRCP;
701 using Teuchos::ArrayView;
702 using Teuchos::RCP;
703 using Teuchos::rcp;
704
705 // Lots and lots of typedefs
706 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
708 // typedef typename KCRS::device_type device_t;
709 typedef typename KCRS::StaticCrsGraphType graph_t;
710 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
711 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
712 typedef typename KCRS::values_type::non_const_type scalar_view_t;
713
714 // Jacobi-specific
715 typedef typename scalar_view_t::memory_space scalar_memory_space;
716
717 typedef Scalar SC;
718 typedef LocalOrdinal LO;
719 typedef GlobalOrdinal GO;
720 typedef Node NO;
721 typedef Map<LO, GO, NO> map_type;
722
723 // NOTE (mfh 15 Sep 2017) This is specifically only for
724 // execution_space = Kokkos::OpenMP, so we neither need nor want
725 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
726 typedef NO::execution_space execution_space;
727 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
728
729 // All of the invalid guys
730 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
731 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
732 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
733
734 // Grab the Kokkos::SparseCrsMatrices & inner stuff
735 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
736 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
737 const KCRS Cmat = C.getLocalMatrixDevice();
738
739 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
740 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
741 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
742 scalar_view_t Cvals = Cmat.values;
743
744 c_lno_view_t Irowptr;
745 c_lno_nnz_view_t Icolind;
746 scalar_view_t Ivals;
747 if (!Bview.importMatrix.is_null()) {
748 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
749 Irowptr = lclB.graph.row_map;
750 Icolind = lclB.graph.entries;
751 Ivals = lclB.values;
752 }
753
754 // Jacobi-specific inner stuff
755 auto Dvals =
756 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
757
758 // Sizes
759 RCP<const map_type> Ccolmap = C.getColMap();
760 size_t m = Aview.origMatrix->getLocalNumRows();
761 size_t n = Ccolmap->getLocalNumElements();
762
763 // Get my node / thread info (right from openmp or parameter list)
764 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
765 if (!params.is_null()) {
766 if (params->isParameter("openmp: ltg thread max"))
767 thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
768 }
769
770 double thread_chunk = (double)(m) / thread_max;
771
772 // Run chunks of the matrix independently
773 Kokkos::parallel_for("Jacobi::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
774 // Thread coordination stuff
775 size_t my_thread_start = tid * thread_chunk;
776 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
777
778 // Allocations
779 std::vector<size_t> c_status(n, INVALID);
780
781 // For each row of A/C
782 size_t CSR_ip = 0, OLD_ip = 0;
783 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
784 // First fill the c_status array w/ locations where we're allowed to
785 // generate nonzeros for this row
786 OLD_ip = Crowptr(i);
787 CSR_ip = Crowptr(i + 1);
788 // NOTE: Vector::getLocalView returns a rank 2 view here
789 SC minusOmegaDval = -omega * Dvals(i, 0);
790
791 for (size_t k = OLD_ip; k < CSR_ip; k++) {
792 c_status[Ccolind(k)] = k;
793 // Reset values in the row of C
794 Cvals(k) = SC_ZERO;
795 }
796
797 // Entries of B
798 for (size_t j = Browptr(i); j < Browptr(i + 1); j++) {
799 const SC Bval = Bvals(j);
800 if (Bval == SC_ZERO)
801 continue;
802 LO Bij = Bcolind(j);
803 LO Cij = Bcol2Ccol(Bij);
804
805 // Assume no repeated entries in B
806 Cvals(c_status[Cij]) += Bvals(j);
807 CSR_ip++;
808 }
809
810 for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
811 LO Aik = Acolind(k);
812 const SC Aval = Avals(k);
813 if (Aval == SC_ZERO)
814 continue;
815
816 if (targetMapToOrigRow(Aik) != LO_INVALID) {
817 // Local matrix
818 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
819
820 for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
821 LO Bkj = Bcolind(j);
822 LO Cij = Bcol2Ccol(Bkj);
823
824 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
825 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
826 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
827
828 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
829 }
830 } else {
831 // Remote matrix
832 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
833 for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
834 LO Ikj = Icolind(j);
835 LO Cij = Icol2Ccol(Ikj);
836
837 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
838 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
839 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
840
841 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
842 }
843 }
844 }
845 }
846 });
847
848 // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
849}
850
851/*********************************************************************************************************/
852template <class InColindArrayType, class InValsArrayType,
853 class OutRowptrType, class OutColindType, class OutValsType>
854void copy_out_from_thread_memory(const OutColindType& thread_total_nnz,
855 const InColindArrayType& Incolind,
856 const InValsArrayType& Invalues,
857 const size_t m,
858 const double thread_chunk,
859 OutRowptrType& row_mapC,
860 OutColindType& entriesC,
861 OutValsType& valuesC) {
862 typedef OutRowptrType lno_view_t;
863 typedef OutColindType lno_nnz_view_t;
864 typedef OutValsType scalar_view_t;
865 typedef typename lno_view_t::execution_space execution_space;
866 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
867
868 // Generate the starting nnz number per thread
869 size_t thread_max = Incolind.size();
870 size_t c_nnz_size = 0;
871 // since this kernel is 'low thread count' is very likely, that we could
872 // sum over the thread_total_nnz and not go parallel, but since it is a view
873 // we don't know where the memory actually lives... so we kinda need to go parallel
874
875 lno_view_t thread_start_nnz("thread_nnz", thread_max + 1);
876 Kokkos::parallel_scan("LTG::Scan", range_type(0, thread_max).set_chunk_size(1), [=](const size_t i, size_t& update, const bool final) {
877 size_t mynnz = thread_total_nnz(i);
878 if (final) thread_start_nnz(i) = update;
879 update += mynnz;
880 if (final && i + 1 == thread_max) thread_start_nnz(i + 1) = update;
881 });
882 c_nnz_size = thread_start_nnz(thread_max);
883
884 // 2019 Apr 10 JJE: update the rowptr's final entry here
885 row_mapC(m) = thread_start_nnz(thread_max);
886
887 // Allocate output
888 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
889 entriesC = entriesC_;
890 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
891 valuesC = valuesC_;
892
893 // Copy out
894 Kokkos::parallel_for("LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
895 const size_t my_thread_start = tid * thread_chunk;
896 const size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
897 const size_t nnz_thread_start = thread_start_nnz(tid);
898 // we need this info, since we did the rowptr in place
899 const size_t nnz_thread_stop = thread_start_nnz(tid + 1);
900
901 // There are two fundamental operations:
902 // * Updateing the rowptr with the correct offset
903 // * copying entries and values
904
905 // First, update the rowptr, it since we did it inplace, it is a += operation
906 // in the paper, I experimented with a coloring scheme that had threads do
907 // do their copies in different orders. It wasn't obvious it was beneficial
908 // but it can be replicated, by choosing the array to copy first based on your
909 // thread id % 3
910 if (my_thread_start != 0) {
911 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
912 row_mapC(i) += nnz_thread_start;
913 }
914 }
915
916 // try to Kokkos::single() the alloc here. It should implicitly barrier
917 // thread 0 doesn't copy the rowptr, so it could hit the single first
918 // in the paper, I played a game that effectively got LTG down to a single
919 // OpenMP barrier. But doing that requires the ability to spawn a single
920 // parallel region. The Scan above was implemented using atomic adds
921 // and the barrier was only needed so you could allocate
922 //
923 // Since we can't spawn a single region, we could move the allocations
924 // here, and using 'single'. Most likely, thread 0 will hit it first allowing
925 // the other threads to update the rowptr while it allocates.
926
927 // next, bulk copy the vals/colind arrays
928 const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
929 for (size_t i = 0; i < my_num_nnz; ++i) {
930 entriesC(nnz_thread_start + i) = Incolind(tid)(i);
931 valuesC(nnz_thread_start + i) = Invalues(tid)(i);
932 }
933
934 // Free the unamanged views, let each thread deallocate its memory
935 // May need to cast away const here..
936 if (Incolind(tid).data()) free(Incolind(tid).data());
937 if (Invalues(tid).data()) free(Invalues(tid).data());
938 });
939} // end copy_out
940
941#endif // OpenMP
942
943/*********************************************************************************************************/
944template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
945void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
946 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
947 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
948 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
949 const LocalOrdinalViewType& Acol2Brow,
950 const LocalOrdinalViewType& Acol2Irow,
951 const LocalOrdinalViewType& Bcol2Ccol,
952 const LocalOrdinalViewType& Icol2Ccol,
953 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
954 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Cimport,
955 const std::string& label,
956 const Teuchos::RCP<Teuchos::ParameterList>& params) {
957#ifdef HAVE_TPETRA_MMM_TIMINGS
958 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
959 using Teuchos::TimeMonitor;
960 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK"))));
961 Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Multiply"))));
962 using Teuchos::rcp;
963#endif
964 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
965
966 // This kernel computes (I-omega Dinv A) B the slow way (for generality's sake, you must understand)
967
968 // 1) Multiply A*B
969 Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(new Matrix_t(C.getRowMap(), 0));
970 Tpetra::MMdetails::mult_A_B_newmatrix(Aview, Bview, *AB, label + std::string(" MSAK"), params);
971
972#ifdef HAVE_TPETRA_MMM_TIMINGS
973 MM2 = Teuchos::null;
974 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Scale"))));
975#endif
976
977 // 2) Scale A by Dinv
978 AB->leftScale(Dinv);
979
980#ifdef HAVE_TPETRA_MMM_TIMINGS
981 MM2 = Teuchos::null;
982 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Add"))));
983#endif
984
985 // 3) Add [-omega Dinv A] + B
986 Teuchos::ParameterList jparams;
987 if (!params.is_null()) {
988 jparams = *params;
989 jparams.set("label", label + std::string(" MSAK Add"));
990 }
991 Scalar one = Teuchos::ScalarTraits<Scalar>::one();
992 const Scalar negJacobiCoefficient = (-omega) * one;
993 Tpetra::MatrixMatrix::add(one, false, *Bview.origMatrix, negJacobiCoefficient, false, *AB, C, AB->getDomainMap(), AB->getRangeMap(), Teuchos::rcp(&jparams, false));
994#ifdef HAVE_TPETRA_MMM_TIMINGS
995 MM2 = Teuchos::null;
996#endif
997} // jacobi_A_B_newmatrix_MultiplyScaleAddKernel
998
999#if defined(HAVE_TPETRA_INST_OPENMP)
1000/*********************************************************************************************************/
1001template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1002static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1003 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1004 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1005 const LocalOrdinalViewType& Acol2PRow,
1006 const LocalOrdinalViewType& Acol2PRowImport,
1007 const LocalOrdinalViewType& Pcol2Accol,
1008 const LocalOrdinalViewType& PIcol2Accol,
1009 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1010 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1011 const std::string& label,
1012 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1013 using Teuchos::RCP;
1014 using Tpetra::MatrixMatrix::UnmanagedView;
1015#ifdef HAVE_TPETRA_MMM_TIMINGS
1016 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1017 using Teuchos::rcp;
1018 using Teuchos::TimeMonitor;
1019 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix LTGCore"))));
1020#endif
1021
1022 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1023 typedef Scalar SC;
1024 typedef LocalOrdinal LO;
1025 typedef GlobalOrdinal GO;
1026 typedef Node NO;
1027 typedef Map<LO, GO, NO> map_type;
1029 typedef typename KCRS::StaticCrsGraphType graph_t;
1030 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1031 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1032 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1033 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1034 typedef typename KCRS::device_type device_t;
1035 typedef typename device_t::execution_space execution_space;
1036 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1037
1038 // Unmanaged versions of the above
1039 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1040 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1041 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1042
1043 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1044 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1045
1046 // Sizes
1047 RCP<const map_type> Accolmap = Ac.getColMap();
1048 size_t m = Rview.origMatrix->getLocalNumRows();
1049 size_t n = Accolmap->getLocalNumElements();
1050
1051 // Get raw Kokkos matrices, and the raw CSR views
1052 const KCRS Rmat = Rview.origMatrix->getLocalMatrixDevice();
1053 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
1054 const KCRS Pmat = Pview.origMatrix->getLocalMatrixDevice();
1055
1056 c_lno_view_t Rrowptr = Rmat.graph.row_map,
1057 Arowptr = Amat.graph.row_map,
1058 Prowptr = Pmat.graph.row_map, Irowptr;
1059 const lno_nnz_view_t Rcolind = Rmat.graph.entries,
1060 Acolind = Amat.graph.entries,
1061 Pcolind = Pmat.graph.entries;
1062 lno_nnz_view_t Icolind;
1063 const scalar_view_t Rvals = Rmat.values,
1064 Avals = Amat.values,
1065 Pvals = Pmat.values;
1066 scalar_view_t Ivals;
1067
1068 if (!Pview.importMatrix.is_null()) {
1069 const KCRS Imat = Pview.importMatrix->getLocalMatrixDevice();
1070 Irowptr = Imat.graph.row_map;
1071 Icolind = Imat.graph.entries;
1072 Ivals = Imat.values;
1073 }
1074
1075 // Classic csr assembly (low memory edition)
1076 //
1077 // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
1078 // The method loops over rows of R, and may resize after processing
1079 // each row. Chris Siefert says that this reflects experience in
1080 // ML; for the non-threaded case, ML found it faster to spend less
1081 // effort on estimation and risk an occasional reallocation.
1082
1083 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1084 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1085
1086 // Get my node / thread info (right from openmp or parameter list)
1087 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1088 if (!params.is_null()) {
1089 if (params->isParameter("openmp: ltg thread max"))
1090 thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
1091 }
1092
1093 double thread_chunk = (double)(m) / thread_max;
1094
1095 // we can construct the rowptr inplace, allocate here and fault in parallel
1096 lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
1097 // we do not touch these until copy out
1098 lno_nnz_view_t entriesAc;
1099 scalar_view_t valuesAc;
1100
1101 // add this, since we do the rowptr in place, we could figure this out
1102 // using the rowptr, but it requires an unusual loop (that jumps all over memory)
1103 lno_nnz_view_t thread_total_nnz("thread_total_nnz", thread_max + 1);
1104
1105 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1106 // routine. The routine computes Ac := R * A * (P_local + P_remote).
1107 //
1108 // For column index Aik in row i of A, Acol2PRow[Aik] tells
1109 // you whether the corresponding row of P belongs to P_local
1110 // ("orig") or P_remote ("Import").
1111
1112 // Thread-local memory
1113 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
1114 Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);
1115
1116 // For each row of R
1117 Kokkos::parallel_for("MMM::RAP::NewMatrix::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
1118 // Thread coordination stuff
1119 size_t my_thread_start = tid * thread_chunk;
1120 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
1121 size_t my_thread_m = my_thread_stop - my_thread_start;
1122
1123 size_t nnzAllocated = (size_t)(my_thread_m * Acest_nnz_per_row + 100);
1124
1125 std::vector<size_t> ac_status(n, INVALID);
1126
1127 // manually allocate the thread-local storage for Ac
1128 u_lno_view_t Acrowptr((typename u_lno_view_t::data_type)malloc(u_lno_view_t::shmem_size(my_thread_m + 1)), my_thread_m + 1);
1129 u_lno_nnz_view_t Accolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1130 u_scalar_view_t Acvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1131
1132 // count entries as they are added to Ac
1133 size_t nnz = 0, nnz_old = 0;
1134 // bmk: loop over the rows of R which are assigned to thread tid
1135 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1136 // directly index into the rowptr
1137 rowmapAc(i) = nnz;
1138 // mfh 27 Sep 2016: For each entry of R in the current row of R
1139 for (size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
1140 LO k = Rcolind(kk); // local column index of current entry of R
1141 const SC Rik = Rvals(kk); // value of current entry of R
1142 if (Rik == SC_ZERO)
1143 continue; // skip explicitly stored zero values in R
1144 // For each entry of A in the current row of A
1145 for (size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
1146 LO l = Acolind(ll); // local column index of current entry of A
1147 const SC Akl = Avals(ll); // value of current entry of A
1148 if (Akl == SC_ZERO)
1149 continue; // skip explicitly stored zero values in A
1150
1151 if (Acol2PRow[l] != LO_INVALID) {
1152 // mfh 27 Sep 2016: If the entry of Acol2PRow
1153 // corresponding to the current entry of A is populated, then
1154 // the corresponding row of P is in P_local (i.e., it lives on
1155 // the calling process).
1156
1157 // Local matrix
1158 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1159
1160 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1161 for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
1162 LO j = Pcolind(jj);
1163 LO Acj = Pcol2Accol(j);
1164 SC Plj = Pvals(jj);
1165
1166 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1167#ifdef HAVE_TPETRA_DEBUG
1168 // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1169 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1170 std::runtime_error,
1171 label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1172#endif
1173 // New entry
1174 ac_status[Acj] = nnz;
1175 Accolind(nnz) = Acj;
1176 Acvals(nnz) = Rik * Akl * Plj;
1177 nnz++;
1178 } else {
1179 Acvals(ac_status[Acj]) += Rik * Akl * Plj;
1180 }
1181 }
1182 } else {
1183 // mfh 27 Sep 2016: If the entry of Acol2PRow
1184 // corresponding to the current entry of A is NOT populated (has
1185 // a flag "invalid" value), then the corresponding row of P is
1186 // in P_remote (i.e., it does not live on the calling process).
1187
1188 // Remote matrix
1189 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1190 for (size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
1191 LO j = Icolind(jj);
1192 LO Acj = PIcol2Accol(j);
1193 SC Plj = Ivals(jj);
1194
1195 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1196#ifdef HAVE_TPETRA_DEBUG
1197 // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1198 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1199 std::runtime_error,
1200 label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1201#endif
1202 // New entry
1203 ac_status[Acj] = nnz;
1204 Accolind(nnz) = Acj;
1205 Acvals(nnz) = Rik * Akl * Plj;
1206 nnz++;
1207 } else {
1208 Acvals(ac_status[Acj]) += Rik * Akl * Plj;
1209 }
1210 }
1211 }
1212 }
1213 }
1214 // Resize for next pass if needed
1215 if (nnz + n > nnzAllocated) {
1216 nnzAllocated *= 2;
1217 Accolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1218 Acvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*)Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1219 }
1220 nnz_old = nnz;
1221 }
1222 thread_total_nnz(tid) = nnz;
1223 tl_colind(tid) = Accolind;
1224 tl_values(tid) = Acvals;
1225 });
1226#ifdef HAVE_TPETRA_MMM_TIMINGS
1227 MM = Teuchos::null;
1228 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix copy from thread local"))));
1229#endif
1230
1231 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1232
1233#ifdef HAVE_TPETRA_MMM_TIMINGS
1234 MM = Teuchos::null;
1235 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1236#endif
1237
1238 // Final sort & set of CRS arrays
1239 // Tpetra's SpGEMM results in almost sorted matrices. Use shell sort.
1240 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1241 // mfh 27 Sep 2016: This just sets pointers.
1242 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1243
1244#ifdef HAVE_TPETRA_MMM_TIMINGS
1245 MM = Teuchos::null;
1246 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1247#endif
1248
1249 // Final FillComplete
1250 //
1251 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1252 // Import (from domain Map to column Map) construction (which costs
1253 // lots of communication) by taking the previously constructed
1254 // Import object. We should be able to do this without interfering
1255 // with the implementation of the local part of sparse matrix-matrix
1256 // multiply above.
1257 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1258 labelList->set("Timer Label", label);
1259 if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
1260 RCP<const Export<LO, GO, NO> > dummyExport;
1261 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1262 Rview.origMatrix->getRangeMap(),
1263 Acimport,
1264 dummyExport,
1265 labelList);
1266}
1267
1268/*********************************************************************************************************/
1269template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1270static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1271 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1272 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1273 const LocalOrdinalViewType& Acol2PRow,
1274 const LocalOrdinalViewType& Acol2PRowImport,
1275 const LocalOrdinalViewType& Pcol2Accol,
1276 const LocalOrdinalViewType& PIcol2Accol,
1277 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1278 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1279 const std::string& label,
1280 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1281 using Teuchos::RCP;
1282 using Tpetra::MatrixMatrix::UnmanagedView;
1283#ifdef HAVE_TPETRA_MMM_TIMINGS
1284 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1285 using Teuchos::rcp;
1286 using Teuchos::TimeMonitor;
1287 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse LTGCore"))));
1288#endif
1289
1290 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1291 typedef Scalar SC;
1292 typedef LocalOrdinal LO;
1293 typedef GlobalOrdinal GO;
1294 typedef Node NO;
1295 typedef Map<LO, GO, NO> map_type;
1297 typedef typename KCRS::StaticCrsGraphType graph_t;
1298 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1299 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1300 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1301 typedef typename KCRS::device_type device_t;
1302 typedef typename device_t::execution_space execution_space;
1303 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1304
1305 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1306 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1307 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1308
1309 // Sizes
1310 RCP<const map_type> Accolmap = Ac.getColMap();
1311 size_t m = Rview.origMatrix->getLocalNumRows();
1312 size_t n = Accolmap->getLocalNumElements();
1313
1314 // Get raw Kokkos matrices, and the raw CSR views
1315 const KCRS Rmat = Rview.origMatrix->getLocalMatrixDevice();
1316 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
1317 const KCRS Pmat = Pview.origMatrix->getLocalMatrixDevice();
1318 const KCRS Cmat = Ac.getLocalMatrixDevice();
1319
1320 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1321 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1322 lno_nnz_view_t Icolind;
1323 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1324 scalar_view_t Cvals = Cmat.values;
1325 scalar_view_t Ivals;
1326
1327 if (!Pview.importMatrix.is_null()) {
1328 const KCRS Imat = Pview.importMatrix->getLocalMatrixDevice();
1329 Irowptr = Imat.graph.row_map;
1330 Icolind = Imat.graph.entries;
1331 Ivals = Imat.values;
1332 }
1333
1334 // Get my node / thread info (right from openmp or parameter list)
1335 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1336 if (!params.is_null()) {
1337 if (params->isParameter("openmp: ltg thread max"))
1338 thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
1339 }
1340
1341 double thread_chunk = (double)(m) / thread_max;
1342
1343 // For each row of R
1344 Kokkos::parallel_for("MMM::RAP::Reuse::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
1345 // Thread coordination stuff
1346 size_t my_thread_start = tid * thread_chunk;
1347 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
1348
1349 std::vector<size_t> c_status(n, INVALID);
1350
1351 // count entries as they are added to Ac
1352 size_t OLD_ip = 0, CSR_ip = 0;
1353 // bmk: loop over the rows of R which are assigned to thread tid
1354 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1355 // First fill the c_status array w/ locations where we're allowed to
1356 // generate nonzeros for this row
1357 OLD_ip = Crowptr(i);
1358 CSR_ip = Crowptr(i + 1);
1359 for (size_t k = OLD_ip; k < CSR_ip; k++) {
1360 c_status[Ccolind(k)] = k;
1361 // Reset values in the row of C
1362 Cvals(k) = SC_ZERO;
1363 }
1364
1365 // mfh 27 Sep 2016: For each entry of R in the current row of R
1366 for (size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
1367 LO k = Rcolind(kk); // local column index of current entry of R
1368 const SC Rik = Rvals(kk); // value of current entry of R
1369 if (Rik == SC_ZERO)
1370 continue; // skip explicitly stored zero values in R
1371 // For each entry of A in the current row of A
1372 for (size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
1373 LO l = Acolind(ll); // local column index of current entry of A
1374 const SC Akl = Avals(ll); // value of current entry of A
1375 if (Akl == SC_ZERO)
1376 continue; // skip explicitly stored zero values in A
1377
1378 if (Acol2PRow[l] != LO_INVALID) {
1379 // mfh 27 Sep 2016: If the entry of Acol2PRow
1380 // corresponding to the current entry of A is populated, then
1381 // the corresponding row of P is in P_local (i.e., it lives on
1382 // the calling process).
1383
1384 // Local matrix
1385 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1386
1387 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1388 for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
1389 LO j = Pcolind(jj);
1390 LO Cij = Pcol2Accol(j);
1391 SC Plj = Pvals(jj);
1392 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1393 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
1394 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1395
1396 Cvals(c_status[Cij]) += Rik * Akl * Plj;
1397 }
1398 } else {
1399 // mfh 27 Sep 2016: If the entry of Acol2PRow
1400 // corresponding to the current entry of A is NOT populated (has
1401 // a flag "invalid" value), then the corresponding row of P is
1402 // in P_remote (i.e., it does not live on the calling process).
1403
1404 // Remote matrix
1405 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1406 for (size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
1407 LO j = Icolind(jj);
1408 LO Cij = PIcol2Accol(j);
1409 SC Plj = Ivals(jj);
1410 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1411 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
1412 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1413
1414 Cvals(c_status[Cij]) += Rik * Akl * Plj;
1415 }
1416 }
1417 }
1418 }
1419 }
1420 });
1421 // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
1422}
1423
1424#endif
1425
1426} // namespace ExtraKernels
1427} // namespace MatrixMatrix
1428} // namespace Tpetra
1429
1430#endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
Namespace Tpetra contains the class and methods constituting the Tpetra library.