TpetraExt_TripleMatrixMultiply_def.hpp
// @HEADER
// *****************************************************************************
// Tpetra: Templated Linear Algebra Services Package
//
// Copyright 2008 NTESS and the Tpetra contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
#define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP

#include "TpetraExt_TripleMatrixMultiply_decl.hpp"
#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"  // for UnmanagedView
#include "Teuchos_VerboseObject.hpp"
#include "Teuchos_Array.hpp"
#include "Tpetra_Util.hpp"
#include "Tpetra_ConfigDefs.hpp"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_RowMatrixTransposer.hpp"
#include "Tpetra_Map.hpp"
#include "Tpetra_Export.hpp"
#include "Tpetra_Import_Util2.hpp"
#include <algorithm>
#include <cmath>
#include "Teuchos_FancyOStream.hpp"
// #include "KokkosSparse_spgemm.hpp"

/*********************************************************************************************************/
// Include the architecture-specific kernel partial specializations here
// NOTE: This needs to be outside all namespaces
#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
#include "TpetraExt_MatrixMatrix_Cuda.hpp"
#include "TpetraExt_MatrixMatrix_HIP.hpp"
#include "TpetraExt_MatrixMatrix_SYCL.hpp"

namespace Tpetra {

namespace TripleMatrixMultiply {

//
// This method forms the matrix-matrix product Ac = op(R) * op(A) * op(P), where
// op(A) == A if transposeA is false,
// op(A) == A^T if transposeA is true,
// and similarly for op(R) and op(P).
//
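// Example (illustrative, not part of this file): forming the multigrid
// Galerkin coarse operator Ac = P^T * A * P by passing P for both R and P.
// R, A, and P must be fillComplete; Ac must still be fill-active. The label
// and params arguments take their declared defaults here.
//
//   Tpetra::TripleMatrixMultiply::MultiplyRAP(P, true,    // op(R) = P^T
//                                             A, false,   // op(A) = A
//                                             P, false,   // op(P) = P
//                                             Ac, true);  // fillComplete Ac on exit
//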
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node>
void MultiplyRAP(const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& R,
                 bool transposeR,
                 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
                 bool transposeA,
                 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& P,
                 bool transposeP,
                 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                 bool call_FillComplete_on_result,
                 const std::string& label,
                 const Teuchos::RCP<Teuchos::ParameterList>& params) {
  using Teuchos::null;
  using Teuchos::RCP;
  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef CrsMatrix<SC, LO, GO, NO> crs_matrix_type;
  typedef Import<LO, GO, NO> import_type;
  typedef Export<LO, GO, NO> export_type;
  typedef RowMatrixTransposer<SC, LO, GO, NO> transposer_type;
  typedef Map<LO, GO, NO> map_type;

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Setup"))));
#endif

  const std::string prefix = "TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";

  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);

  // The input matrices R, A and P must all be fillComplete.
  TEUCHOS_TEST_FOR_EXCEPTION(!R.isFillComplete(), std::runtime_error, prefix << "Matrix R is not fill complete.");
  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
  TEUCHOS_TEST_FOR_EXCEPTION(!P.isFillComplete(), std::runtime_error, prefix << "Matrix P is not fill complete.");

  // If transposeR is true, then Rprime will be the transpose of R
  // (computed explicitly via RowMatrixTransposer). Otherwise, Rprime
  // will just be a pointer to R.
  RCP<const crs_matrix_type> Rprime = null;
  // If transposeA is true, then Aprime will be the transpose of A
  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
  // will just be a pointer to A.
  RCP<const crs_matrix_type> Aprime = null;
  // If transposeP is true, then Pprime will be the transpose of P
  // (computed explicitly via RowMatrixTransposer). Otherwise, Pprime
  // will just be a pointer to P.
  RCP<const crs_matrix_type> Pprime = null;

  // Is this a "clean" matrix?
  //
  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
  // locally nor globally indexed, then it was empty. I don't like
  // this, because the most straightforward implementation presumes
  // lazy allocation of indices. However, historical precedent
  // demands that we keep around this predicate as a way to test
  // whether the matrix is empty.
  const bool newFlag = !Ac.getGraph()->isLocallyIndexed() && !Ac.getGraph()->isGloballyIndexed();
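  // (newFlag == true selects the "newmatrix" code paths below, which build
  // Ac's graph from scratch; the "reuse" paths instead recompute values
  // against Ac's existing structure.)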

  using Teuchos::ParameterList;
  RCP<ParameterList> transposeParams(new ParameterList);
  transposeParams->set("sort", false);

  if (transposeR && &R != &P) {
    transposer_type transposer(Teuchos::rcpFromRef(R));
    Rprime = transposer.createTranspose(transposeParams);
  } else {
    Rprime = Teuchos::rcpFromRef(R);
  }

  if (transposeA) {
    transposer_type transposer(Teuchos::rcpFromRef(A));
    Aprime = transposer.createTranspose(transposeParams);
  } else {
    Aprime = Teuchos::rcpFromRef(A);
  }

  if (transposeP) {
    transposer_type transposer(Teuchos::rcpFromRef(P));
    Pprime = transposer.createTranspose(transposeParams);
  } else {
    Pprime = Teuchos::rcpFromRef(P);
  }

  // Check size compatibility
  global_size_t numRCols = R.getDomainMap()->getGlobalNumElements();
  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
  global_size_t numPCols = P.getDomainMap()->getGlobalNumElements();
  global_size_t Rleft = transposeR ? numRCols : R.getGlobalNumRows();
  global_size_t Rright = transposeR ? R.getGlobalNumRows() : numRCols;
  global_size_t Aleft = transposeA ? numACols : A.getGlobalNumRows();
  global_size_t Aright = transposeA ? A.getGlobalNumRows() : numACols;
  global_size_t Pleft = transposeP ? numPCols : P.getGlobalNumRows();
  global_size_t Pright = transposeP ? P.getGlobalNumRows() : numPCols;
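  // For example, if transposeR is true and R is globally m x n, then
  // op(R) = R^T is n x m, so Rleft = numRCols = n and Rright = m.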
  TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
                             prefix << "ERROR, inner dimensions of op(R) and op(A) "
                                       "must match for matrix-matrix product. op(R) is "
                                    << Rleft << "x" << Rright << ", op(A) is " << Aleft << "x" << Aright);

  TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
                             prefix << "ERROR, inner dimensions of op(A) and op(P) "
                                       "must match for matrix-matrix product. op(A) is "
                                    << Aleft << "x" << Aright << ", op(P) is " << Pleft << "x" << Pright);

  // The result matrix Ac must at least have a row-map that reflects the correct
  // row-size. Don't check the number of columns because rectangular matrices
  // which were constructed with only one map can still end up having the
  // correct capacity and dimensions when filled.
  TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.getGlobalNumRows(), std::runtime_error,
                             prefix << "ERROR, dimensions of result Ac must "
                                       "match dimensions of op(R) * op(A) * op(P). Ac has "
                                    << Ac.getGlobalNumRows()
                                    << " rows, should have at least " << Rleft << std::endl);

  // It doesn't matter whether Ac is already Filled or not. If it is already
  // Filled, it must have space allocated for the positions that will be
  // referenced in forming Ac = op(R)*op(A)*op(P). If it doesn't have enough space,
  // we'll error out later when trying to store result values.

  // CGB: However, matrix must be in active-fill
  TEUCHOS_TEST_FOR_EXCEPT(Ac.isFillActive() == false);

  // We're going to need to import remotely-owned sections of P if
  // more than one processor is performing this run, depending on the scenario.
  int numProcs = P.getComm()->getSize();

  // Declare a couple of structs that will be used to hold views of the data
  // of R, A and P, to be used for fast access during the matrix-multiplication.
  CrsMatrixStruct<SC, LO, GO, NO> Rview;
  CrsMatrixStruct<SC, LO, GO, NO> Aview;
  CrsMatrixStruct<SC, LO, GO, NO> Pview;

  RCP<const map_type> targetMap_R = Rprime->getRowMap();
  RCP<const map_type> targetMap_A = Aprime->getRowMap();
  RCP<const map_type> targetMap_P = Pprime->getRowMap();

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All I&X"))));
#endif

  // Now import any needed remote rows and populate the Rview and Aview structs.
  // NOTE: We assert that an import isn't needed --- since we do the transpose
  // above to handle that.
  RCP<const import_type> dummyImporter;

  if (!(transposeR && &R == &P))
    MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter, true, label, params);

  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);

  // We will also need local access to all rows of P that correspond to the
  // column-map of op(A).
  if (numProcs > 1)
    targetMap_P = Aprime->getColMap();

  // Import any needed remote rows and populate the Pview struct.
  MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(), false, label, params);

  RCP<crs_matrix_type> Actemp;

  bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
  if (needs_final_export)
    Actemp = rcp(new crs_matrix_type(Pprime->getColMap(), 0));
  else
    Actemp = rcp(&Ac, false);  // don't allow deallocation

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Multiply"))));
#endif

  // Call the appropriate method to perform the actual multiplication.
  if (newFlag) {
    if (transposeR && &R == &P)
      MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
    else
      MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
  } else if (call_FillComplete_on_result) {
    if (transposeR && &R == &P)
      MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
    else
      MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
  } else {
    // mfh 27 Sep 2016: Is this the "slow" case? This
    // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
    // thread-parallel inserts, but that may take some effort.
    // CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(Ac);

    // MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);

    // #ifdef HAVE_TPETRA_MMM_TIMINGS
    //   MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All FillComplete"))));
    // #endif
    // if (call_FillComplete_on_result) {
    //   // We'll call FillComplete on the C matrix before we exit, and give it a
    //   // domain-map and a range-map.
    //   // The domain-map will be the domain-map of B, unless
    //   // op(B)==transpose(B), in which case the range-map of B will be used.
    //   // The range-map will be the range-map of A, unless op(A)==transpose(A),
    //   // in which case the domain-map of A will be used.
    //   if (!C.isFillComplete())
    //     C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
    // }
    // Not implemented
    if (transposeR && &R == &P)
      MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
    else
      MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
  }

  if (needs_final_export) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = Teuchos::null;
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP exportAndFillComplete"))));
#endif
    Teuchos::ParameterList labelList;
    labelList.set("Timer Label", label);
    Teuchos::ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params", false);

    RCP<crs_matrix_type> Acprime = Teuchos::rcpFromRef(Ac);
    bool isMM = true;
    bool overrideAllreduce = false;
    int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
    if (!params.is_null()) {
      Teuchos::ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params", false);
      mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
      int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
      if (mm_optimization_core_count2 < mm_optimization_core_count)
        mm_optimization_core_count = mm_optimization_core_count2;
      isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete", false);
      overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck", false);

      labelList.set("compute global constants", params->get("compute global constants", true));
    }
    labelList_subList.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count, "Core Count above which the optimized neighbor discovery is used");

    labelList_subList.set("isMatrixMatrix_TransferAndFillComplete", isMM,
                          "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
    labelList_subList.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);

    export_type exporter = export_type(*Pprime->getGraph()->getImporter());
    Actemp->exportAndFillComplete(Acprime,
                                  exporter,
                                  Acprime->getDomainMap(),
                                  Acprime->getRangeMap(),
                                  rcp(&labelList, false));
  }
#ifdef HAVE_TPETRA_MMM_STATISTICS
  printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label + std::string(" RAP MMM"));
#endif
}

}  // End namespace TripleMatrixMultiply

namespace MMdetails {

// Kernel method for computing the local portion of Ac = R*A*P
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node>
void mult_R_A_P_newmatrix(
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
    const std::string& label,
    const Teuchos::RCP<Teuchos::ParameterList>& params) {
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // typedef Scalar SC; // unused
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;

  typedef Import<LO, GO, NO> import_type;
  typedef Map<LO, GO, NO> map_type;

  // Kokkos typedefs
  typedef typename map_type::local_map_type local_map_type;
  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
#endif
  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

  // Build the final importer / column map, hash table lookups for Ac
  RCP<const import_type> Cimport;
  RCP<const map_type> Ccolmap;
  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
  local_map_type Irowmap_local;
  if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
  local_map_type Icolmap_local;
  if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();

  // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
  // indices of P, to local column indices of Ac. (P and Ac have the
  // same number of columns.) The kernel uses this, instead of
  // copying the entire input matrix P and converting its column
  // indices to those of Ac.
  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;

  if (Pview.importMatrix.is_null()) {
    // mfh 27 Sep 2016: P has no "remotes," so P and C have the same column Map.
    Cimport = Pimport;
    Ccolmap = Pview.colMap;
    const LO colMapSize = static_cast<LO>(Pview.colMap->getLocalNumElements());
    // Pcol2Ccol is trivial
    Kokkos::parallel_for(
        "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
        Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
        KOKKOS_LAMBDA(const LO i) {
          Pcol2Ccol(i) = i;
        });
  } else {
    // mfh 27 Sep 2016: P has "remotes," so we need to build the
    // column Map of C, as well as C's Import object (from its domain
    // Map to its column Map). C's column Map is the union of the
    // column Maps of (the local part of) P, and the "remote" part of
    // P. Ditto for the Import. We have optimized this "setUnion"
    // operation on Import objects and Maps.

    // Choose the right variant of setUnion
    if (!Pimport.is_null() && !Iimport.is_null()) {
      Cimport = Pimport->setUnion(*Iimport);
    } else if (!Pimport.is_null() && Iimport.is_null()) {
      Cimport = Pimport->setUnion();
    } else if (Pimport.is_null() && !Iimport.is_null()) {
      Cimport = Iimport->setUnion();
    } else {
      throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
    }
    Ccolmap = Cimport->getTargetMap();

    // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
    // in general. We should get rid of it in order to reduce
    // communication costs of sparse matrix-matrix multiply.
    TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                               std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");

    // NOTE: This is not efficient and should be folded into setUnion
    //
    // mfh 27 Sep 2016: What the above comment means, is that the
    // setUnion operation on Import objects could also compute these
    // local index - to - local index look-up tables.
    Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
    local_map_type Ccolmap_local = Ccolmap->getLocalMap();
    Kokkos::parallel_for(
        "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement", range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
          Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
        });
    Kokkos::parallel_for(
        "Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement", range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
          Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
        });
  }

  // Replace the column map
  //
  // mfh 27 Sep 2016: We do this because C was originally created
  // without a column Map. Now we have its column Map.
  Ac.replaceColMap(Ccolmap);

  // mfh 27 Sep 2016: Construct tables that map from local column
  // indices of A, to local row indices of either P_local (the locally
  // owned part of P), or P_remote (the "imported" remote part of P).
  //
  // For column index Aik in row i of A, if the corresponding row of P
  // exists in the local part of P ("orig") (which I'll call P_local),
  // then targetMapToOrigRow[Aik] is the local index of that row of P.
  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
  //
  // For column index Aik in row i of A, if the corresponding row of P
  // exists in the remote part of P ("Import") (which I'll call
  // P_remote), then targetMapToImportRow[Aik] is the local index of
  // that row of P. Otherwise, targetMapToImportRow[Aik] is "invalid"
  // (a flag value).
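  //
  // For illustration (made-up values): if A's column map holds the rows of P
  // with GIDs {3, 5, 9}, where rows 3 and 5 of P are locally owned and row 9
  // was imported, then targetMapToOrigRow = {lid(3), lid(5), INVALID} and
  // targetMapToImportRow = {INVALID, INVALID, import-lid(9)}.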

  // Run through all the hash table lookups once and for all
  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
  Kokkos::parallel_for(
      "Tpetra::mult_R_A_P_newmatrix::construct_tables", range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
        GO aidx = Acolmap_local.getGlobalElement(i);
        LO P_LID = Prowmap_local.getLocalElement(aidx);
        if (P_LID != LO_INVALID) {
          targetMapToOrigRow(i) = P_LID;
          targetMapToImportRow(i) = LO_INVALID;
        } else {
          LO I_LID = Irowmap_local.getLocalElement(aidx);
          targetMapToOrigRow(i) = LO_INVALID;
          targetMapToImportRow(i) = I_LID;
        }
      });

  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
  KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
      mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
                                          targetMapToOrigRow, targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
                                          Ac, Cimport, label, params);
}

// Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node>
void mult_R_A_P_reuse(
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
    const std::string& label,
    const Teuchos::RCP<Teuchos::ParameterList>& params) {
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // typedef Scalar SC; // unused
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;

  typedef Import<LO, GO, NO> import_type;
  typedef Map<LO, GO, NO> map_type;

  // Kokkos typedefs
  typedef typename map_type::local_map_type local_map_type;
  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
#endif
  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

  // Build the final importer / column map, hash table lookups for Ac
  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
  RCP<const map_type> Ccolmap = Ac.getColMap();
  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
  local_map_type Irowmap_local;
  if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
  local_map_type Icolmap_local;
  if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
  local_map_type Ccolmap_local = Ccolmap->getLocalMap();

  // Build the final importer / column map, hash table lookups for C
  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
  {
    // Bcol2Ccol may not be trivial, as Ccolmap was compressed during fillComplete in newmatrix mode.
    // So, the column map of C may be a strict subset of the column map of P.
    Kokkos::parallel_for(
        range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
          Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
        });

    if (!Pview.importMatrix.is_null()) {
      TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                                 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");

      Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
      Kokkos::parallel_for(
          range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
            Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
          });
    }
  }

  // Run through all the hash table lookups once and for all
  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
  Kokkos::parallel_for(
      range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
        GO aidx = Acolmap_local.getGlobalElement(i);
        LO B_LID = Prowmap_local.getLocalElement(aidx);
        if (B_LID != LO_INVALID) {
          targetMapToOrigRow(i) = B_LID;
          targetMapToImportRow(i) = LO_INVALID;
        } else {
          LO I_LID = Irowmap_local.getLocalElement(aidx);
          targetMapToOrigRow(i) = LO_INVALID;
          targetMapToImportRow(i) = I_LID;
        }
      });

  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
  KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
      mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
                                      targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
                                      Ac, Cimport, label, params);
}

// Kernel method for computing the local portion of Ac = P^T*A*P
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node>
void mult_PT_A_P_newmatrix(
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
    const std::string& label,
    const Teuchos::RCP<Teuchos::ParameterList>& params) {
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // typedef Scalar SC; // unused
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;

  typedef Import<LO, GO, NO> import_type;
  typedef Map<LO, GO, NO> map_type;

  // Kokkos typedefs
  typedef typename map_type::local_map_type local_map_type;
  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP M5 Cmap")))));
#endif
  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

  // Build the final importer / column map, hash table lookups for Ac
  RCP<const import_type> Cimport;
  RCP<const map_type> Ccolmap;
  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
  local_map_type Irowmap_local;
  if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
  local_map_type Icolmap_local;
  if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();

  // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
  // indices of P, to local column indices of Ac. (P and Ac have the
  // same number of columns.) The kernel uses this, instead of
  // copying the entire input matrix P and converting its column
  // indices to those of Ac.
  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;

  if (Pview.importMatrix.is_null()) {
    // mfh 27 Sep 2016: P has no "remotes," so P and C have the same column Map.
    Cimport = Pimport;
    Ccolmap = Pview.colMap;
    const LO colMapSize = static_cast<LO>(Pview.colMap->getLocalNumElements());
    // Pcol2Ccol is trivial
    Kokkos::parallel_for(
        "Tpetra::mult_PT_A_P_newmatrix::Pcol2Ccol_fill",
        Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
        KOKKOS_LAMBDA(const LO i) {
          Pcol2Ccol(i) = i;
        });
  } else {
    // mfh 27 Sep 2016: P has "remotes," so we need to build the
    // column Map of C, as well as C's Import object (from its domain
    // Map to its column Map). C's column Map is the union of the
    // column Maps of (the local part of) P, and the "remote" part of
    // P. Ditto for the Import. We have optimized this "setUnion"
    // operation on Import objects and Maps.

    // Choose the right variant of setUnion
    if (!Pimport.is_null() && !Iimport.is_null()) {
      Cimport = Pimport->setUnion(*Iimport);
    } else if (!Pimport.is_null() && Iimport.is_null()) {
      Cimport = Pimport->setUnion();
    } else if (Pimport.is_null() && !Iimport.is_null()) {
      Cimport = Iimport->setUnion();
    } else {
      throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
    }
    Ccolmap = Cimport->getTargetMap();

    // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
    // in general. We should get rid of it in order to reduce
    // communication costs of sparse matrix-matrix multiply.
    TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                               std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");

    // NOTE: This is not efficient and should be folded into setUnion
    //
    // mfh 27 Sep 2016: What the above comment means, is that the
    // setUnion operation on Import objects could also compute these
    // local index - to - local index look-up tables.
    Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
    local_map_type Ccolmap_local = Ccolmap->getLocalMap();
    Kokkos::parallel_for(
        "Tpetra::mult_PT_A_P_newmatrix::Pcol2Ccol_getGlobalElement", range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
          Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
        });
    Kokkos::parallel_for(
        "Tpetra::mult_PT_A_P_newmatrix::Icol2Ccol_getGlobalElement", range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
          Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
        });
  }

  // Replace the column map
  //
  // mfh 27 Sep 2016: We do this because C was originally created
  // without a column Map. Now we have its column Map.
  Ac.replaceColMap(Ccolmap);

  // mfh 27 Sep 2016: Construct tables that map from local column
  // indices of A, to local row indices of either P_local (the locally
  // owned part of P), or P_remote (the "imported" remote part of P).
  //
  // For column index Aik in row i of A, if the corresponding row of P
  // exists in the local part of P ("orig") (which I'll call P_local),
  // then targetMapToOrigRow[Aik] is the local index of that row of P.
  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
  //
  // For column index Aik in row i of A, if the corresponding row of P
  // exists in the remote part of P ("Import") (which I'll call
  // P_remote), then targetMapToImportRow[Aik] is the local index of
  // that row of P. Otherwise, targetMapToImportRow[Aik] is "invalid"
  // (a flag value).

  // Run through all the hash table lookups once and for all
  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());

  Kokkos::parallel_for(
      "Tpetra::mult_PT_A_P_newmatrix::construct_tables", range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
        GO aidx = Acolmap_local.getGlobalElement(i);
        LO P_LID = Prowmap_local.getLocalElement(aidx);
        if (P_LID != LO_INVALID) {
          targetMapToOrigRow(i) = P_LID;
          targetMapToImportRow(i) = LO_INVALID;
        } else {
          LO I_LID = Irowmap_local.getLocalElement(aidx);
          targetMapToOrigRow(i) = LO_INVALID;
          targetMapToImportRow(i) = I_LID;
        }
      });

  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
  KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
      mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
                                           targetMapToOrigRow, targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
                                           Ac, Cimport, label, params);
}

// Kernel method for computing the local portion of Ac = P^T*A*P (reuse mode)
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node>
void mult_PT_A_P_reuse(
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
    const std::string& label,
    const Teuchos::RCP<Teuchos::ParameterList>& params) {
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // typedef Scalar SC; // unused
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;

  typedef Import<LO, GO, NO> import_type;
  typedef Map<LO, GO, NO> map_type;

  // Kokkos typedefs
  typedef typename map_type::local_map_type local_map_type;
  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
#endif
  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

  // Build the final importer / column map, hash table lookups for Ac
  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
  RCP<const map_type> Ccolmap = Ac.getColMap();
  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
  local_map_type Irowmap_local;
  if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
  local_map_type Icolmap_local;
  if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
  local_map_type Ccolmap_local = Ccolmap->getLocalMap();

  // Build the final importer / column map, hash table lookups for C
  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
  {
    // Bcol2Ccol may not be trivial, as Ccolmap was compressed during fillComplete in newmatrix mode.
    // So, the column map of C may be a strict subset of the column map of P.
    Kokkos::parallel_for(
        range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
          Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
        });

    if (!Pview.importMatrix.is_null()) {
      TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                                 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");

      Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
      Kokkos::parallel_for(
          range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
            Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
          });
    }
  }

  // Run through all the hash table lookups once and for all
  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
  Kokkos::parallel_for(
      range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
        GO aidx = Acolmap_local.getGlobalElement(i);
        LO B_LID = Prowmap_local.getLocalElement(aidx);
        if (B_LID != LO_INVALID) {
          targetMapToOrigRow(i) = B_LID;
          targetMapToImportRow(i) = LO_INVALID;
        } else {
          LO I_LID = Irowmap_local.getLocalElement(aidx);
          targetMapToOrigRow(i) = LO_INVALID;
          targetMapToImportRow(i) = I_LID;
        }
      });

  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
  KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
      mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
                                       targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
                                       Ac, Cimport, label, params);
}

/*********************************************************************************************************/
// RAP NewMatrix Kernel wrappers (Default non-threaded version)
// Computes R * A * P -> Ac using classic Gustavson approach
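//
// In Gustavson's algorithm, each row i of Ac is formed in a single pass: row
// R(i,:) selects rows of A, each of which selects rows of P, and the partial
// products Rik * Akl * Plj are merged into row i through a dense "status"
// array (ac_status below) that records where each column index of the
// current row was last deposited.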
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node,
          class LocalOrdinalViewType>
void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::
    mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                                        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                                        const LocalOrdinalViewType& Acol2Prow_dev,
                                        const LocalOrdinalViewType& Acol2PIrow_dev,
                                        const LocalOrdinalViewType& Pcol2Accol_dev,
                                        const LocalOrdinalViewType& PIcol2Accol_dev,
                                        CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                        Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
                                        const std::string& label,
                                        const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore"))));
#endif

  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO, GO, NO> map_type;
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes
  RCP<const map_type> Accolmap = Ac.getColMap();
  size_t m = Rview.origMatrix->getLocalNumRows();
  size_t n = Accolmap->getLocalNumElements();
  size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();

  // Routine runs on host; have to put arguments on host, too
  auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                       Acol2Prow_dev);
  auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                        Acol2PIrow_dev);
  auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                        Pcol2Accol_dev);
  auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                         PIcol2Accol_dev);

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const auto Amat = Aview.origMatrix->getLocalMatrixHost();
  const auto Pmat = Pview.origMatrix->getLocalMatrixHost();
  const auto Rmat = Rview.origMatrix->getLocalMatrixHost();

  auto Arowptr = Amat.graph.row_map;
  auto Prowptr = Pmat.graph.row_map;
  auto Rrowptr = Rmat.graph.row_map;
  const auto Acolind = Amat.graph.entries;
  const auto Pcolind = Pmat.graph.entries;
  const auto Rcolind = Rmat.graph.entries;
  const auto Avals = Amat.values;
  const auto Pvals = Pmat.values;
  const auto Rvals = Rmat.values;

  typename c_lno_view_t::host_mirror_type::const_type Irowptr;
  typename lno_nnz_view_t::host_mirror_type Icolind;
  typename scalar_view_t::host_mirror_type Ivals;
  if (!Pview.importMatrix.is_null()) {
    auto lclP = Pview.importMatrix->getLocalMatrixHost();
    Irowptr = lclP.graph.row_map;
    Icolind = lclP.graph.entries;
    Ivals = lclP.values;
    p_max_nnz_per_row = std::max(p_max_nnz_per_row, Pview.importMatrix->getLocalMaxNumRowEntries());
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore - Compare"))));
#endif

  // Classic csr assembly (low memory edition)
  //
  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
  // The method loops over rows of R, and may resize after processing
  // each row. Chris Siefert says that this reflects experience in
  // ML; for the non-threaded case, ML found it faster to spend less
  // effort on estimation and risk an occasional reallocation.
  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
  typename lno_view_t::host_mirror_type Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"), m + 1);
  typename lno_nnz_view_t::host_mirror_type Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"), CSR_alloc);
  typename scalar_view_t::host_mirror_type Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"), CSR_alloc);

  // mfh 27 Sep 2016: The ac_status array is an implementation detail
  // of the local sparse matrix-matrix multiply routine.

  // The status array will contain the index into colind where this entry was last deposited.
  //   ac_status[i] < nnz - not in the row yet
  //   ac_status[i] >= nnz - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
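  //
  // For illustration: while assembling row i (whose entries occupy positions
  // nnz_old..nnz-1 of Ccolind), a column j with ac_status[j] == INVALID or
  // ac_status[j] < nnz_old has not yet appeared in row i, so a new entry is
  // appended at position nnz; otherwise Cvals[ac_status[j]] is accumulated.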
  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
  Array<size_t> ac_status(n, ST_INVALID);

  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
  // routine. The routine computes Ac := R * A * (P_local + P_remote).
  //
  // For column index Aik in row i of A, Acol2Prow[Aik] tells
  // you whether the corresponding row of P belongs to P_local
  // ("orig") or P_remote ("Import").

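  // In index form, row i of the product is
  //   Ac(i,j) = sum_k sum_l R(i,k) * A(k,l) * P(l,j),
  // where row l of P is read from P_local or P_remote as decided above.
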
  // For each row of R
  size_t nnz = 0, nnz_old = 0;
  for (size_t i = 0; i < m; i++) {
    // mfh 27 Sep 2016: m is the number of rows in the input matrix R
    // on the calling process.
    Crowptr[i] = nnz;

    // mfh 27 Sep 2016: For each entry of R in the current row of R
    for (size_t kk = Rrowptr[i]; kk < Rrowptr[i + 1]; kk++) {
      LO k = Rcolind[kk];        // local column index of current entry of R
      const SC Rik = Rvals[kk];  // value of current entry of R
      if (Rik == SC_ZERO)
        continue;  // skip explicitly stored zero values in R
      // For each entry of A in the current row of A
      for (size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
        LO l = Acolind[ll];        // local column index of current entry of A
        const SC Akl = Avals[ll];  // value of current entry of A
        if (Akl == SC_ZERO)
          continue;  // skip explicitly stored zero values in A

        if (Acol2Prow[l] != LO_INVALID) {
          // mfh 27 Sep 2016: If the entry of Acol2Prow
          // corresponding to the current entry of A is populated, then
          // the corresponding row of P is in P_local (i.e., it lives on
          // the calling process).

          // Local matrix
          size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);

          // mfh 27 Sep 2016: Go through all entries in that row of P_local.
          for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
            LO j = Pcolind[jj];
            LO Acj = Pcol2Accol[j];
            SC Plj = Pvals[jj];

            if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
              // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
              TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
                                         std::runtime_error,
                                         label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
#endif
              // New entry
              ac_status[Acj] = nnz;
              Ccolind[nnz] = Acj;
              Cvals[nnz] = Rik * Akl * Plj;
              nnz++;
            } else {
              Cvals[ac_status[Acj]] += Rik * Akl * Plj;
            }
          }
        } else {
          // mfh 27 Sep 2016: If the entry of Acol2PRow
          // corresponding to the current entry of A is NOT populated (has
          // a flag "invalid" value), then the corresponding row of P is
          // in P_remote (i.e., it does not live on the calling process).

          // Remote matrix
          size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
          for (size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
            LO j = Icolind[jj];
            LO Acj = PIcol2Accol[j];
            SC Plj = Ivals[jj];

            if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
              // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
              TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
                                         std::runtime_error,
                                         label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
#endif
              // New entry
              ac_status[Acj] = nnz;
              Ccolind[nnz] = Acj;
              Cvals[nnz] = Rik * Akl * Plj;
              nnz++;
            } else {
              Cvals[ac_status[Acj]] += Rik * Akl * Plj;
            }
          }
        }
      }
    }
    // Resize for next pass if needed
    if (nnz + n > CSR_alloc) {
      CSR_alloc *= 2;
      Kokkos::resize(Ccolind, CSR_alloc);
      Kokkos::resize(Cvals, CSR_alloc);
    }
    nnz_old = nnz;
  }

  Crowptr[m] = nnz;

  // Downward resize
  Kokkos::resize(Ccolind, nnz);
  Kokkos::resize(Cvals, nnz);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
#endif
  auto Crowptr_dev = Kokkos::create_mirror_view_and_copy(
      typename KCRS::device_type(), Crowptr);
  auto Ccolind_dev = Kokkos::create_mirror_view_and_copy(
      typename KCRS::device_type(), Ccolind);
  auto Cvals_dev = Kokkos::create_mirror_view_and_copy(
      typename KCRS::device_type(), Cvals);

  // Final sort & set of CRS arrays
  if (params.is_null() || params->get("sort entries", true))
    Import_Util::sortCrsEntries(Crowptr_dev, Ccolind_dev, Cvals_dev);
  Ac.setAllValues(Crowptr_dev, Ccolind_dev, Cvals_dev);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
#endif

  // Final FillComplete
  //
  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
  // Import (from domain Map to column Map) construction (which costs
  // lots of communication) by taking the previously constructed
  // Import object. We should be able to do this without interfering
  // with the implementation of the local part of sparse matrix-matrix
  // multiply above.
  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
  labelList->set("Timer Label", label);
  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
  RCP<const Export<LO, GO, NO> > dummyExport;
  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
                              Rview.origMatrix->getRangeMap(),
                              Acimport,
                              dummyExport,
                              labelList);
}

/*********************************************************************************************************/
// RAP Reuse Kernel wrappers (Default non-threaded version)
// Computes R * A * P -> Ac using reuse Gustavson
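//
// In reuse mode, Ac's sparsity pattern is taken as fixed (it was built by a
// previous "newmatrix" call); only the stored values are recomputed, and a
// product entry that falls outside Ac's existing pattern raises an error.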
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node,
          class LocalOrdinalViewType>
void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::
    mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                                    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                                    const LocalOrdinalViewType& Acol2Prow_dev,
                                    const LocalOrdinalViewType& Acol2PIrow_dev,
                                    const LocalOrdinalViewType& Pcol2Accol_dev,
                                    const LocalOrdinalViewType& PIcol2Accol_dev,
                                    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                    Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
                                    const std::string& label,
                                    const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore"))));
#endif

  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO, GO, NO> map_type;
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes
  RCP<const map_type> Accolmap = Ac.getColMap();
  size_t m = Rview.origMatrix->getLocalNumRows();
  size_t n = Accolmap->getLocalNumElements();
  size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();

  // Routine runs on host; have to put arguments on host, too
  auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                       Acol2Prow_dev);
  auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                        Acol2PIrow_dev);
  auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                        Pcol2Accol_dev);
  auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                         PIcol2Accol_dev);

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
  const KCRS& Pmat = Pview.origMatrix->getLocalMatrixHost();
  const KCRS& Rmat = Rview.origMatrix->getLocalMatrixHost();
  const KCRS& Cmat = Ac.getLocalMatrixHost();

  c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
  scalar_view_t Cvals = Cmat.values;

  c_lno_view_t Irowptr;
  lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if (!Pview.importMatrix.is_null()) {
    auto lclP = Pview.importMatrix->getLocalMatrixHost();
    Irowptr = lclP.graph.row_map;
    Icolind = lclP.graph.entries;
    Ivals = lclP.values;
    p_max_nnz_per_row = std::max(p_max_nnz_per_row, Pview.importMatrix->getLocalMaxNumRowEntries());
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore - Compare"))));
#endif

  // mfh 27 Sep 2016: The ac_status array is an implementation detail
  // of the local sparse matrix-matrix multiply routine.

  // The status array will contain the index into colind where this entry was last deposited.
  //   ac_status[i] < nnz - not in the row yet
  //   ac_status[i] >= nnz - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  Array<size_t> ac_status(n, ST_INVALID);

  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
  // routine. The routine computes Ac := R * A * (P_local + P_remote).
  //
  // For column index Aik in row i of A, Acol2Prow[Aik] tells
  // you whether the corresponding row of P belongs to P_local
  // ("orig") or P_remote ("Import").

  // This fence is necessary until the UVM host accesses below (for example,
  // Crowptr) are changed. It is probably also needed in
  // mult_R_A_P_newmatrix_kernel_wrapper, but that has not yet been
  // demonstrated by a test failure.
  Kokkos::fence();

1222 // For each row of R
1223 size_t OLD_ip = 0, CSR_ip = 0;
1224 for (size_t i = 0; i < m; i++) {
1225 // First fill the c_status array w/ locations where we're allowed to
1226 // generate nonzeros for this row
1227 OLD_ip = Crowptr[i];
1228 CSR_ip = Crowptr[i + 1];
1229 for (size_t k = OLD_ip; k < CSR_ip; k++) {
1230 ac_status[Ccolind[k]] = k;
1231
1232 // Reset values in the row of C
1233 Cvals[k] = SC_ZERO;
1234 }
1235
1236 // mfh 27 Sep 2016: For each entry of R in the current row of R
1237 for (size_t kk = Rrowptr[i]; kk < Rrowptr[i + 1]; kk++) {
1238 LO k = Rcolind[kk]; // local column index of current entry of R
1239 const SC Rik = Rvals[kk]; // value of current entry of R
1240 if (Rik == SC_ZERO)
1241 continue; // skip explicitly stored zero values in R
1242 // For each entry of A in the current row of A
1243 for (size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
1244 LO l = Acolind[ll]; // local column index of current entry of A
1245 const SC Akl = Avals[ll]; // value of current entry of A
1246 if (Akl == SC_ZERO)
1247 continue; // skip explicitly stored zero values in A
1248
1249 if (Acol2Prow[l] != LO_INVALID) {
1250 // mfh 27 Sep 2016: If the entry of Acol2Prow
1251 // corresponding to the current entry of A is populated, then
1252 // the corresponding row of P is in P_local (i.e., it lives on
1253 // the calling process).
1254
1255 // Local matrix
1256 size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1257
1258 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1259 for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
1260 LO j = Pcolind[jj];
1261 LO Cij = Pcol2Accol[j];
1262 SC Plj = Pvals[jj];
1263
1264 TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1265 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
1266 << "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1267
1268 Cvals[ac_status[Cij]] += Rik * Akl * Plj;
1269 }
1270 } else {
1271 // mfh 27 Sep 2016: If the entry of Acol2Prow
1272 // corresponding to the current entry of A is NOT populated (has
1273 // a flag "invalid" value), then the corresponding row of P is
1274 // in P_remote (i.e., it does not live on the calling process).
1275
1276 // Remote matrix
1277 size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1278 for (size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
1279 LO j = Icolind[jj];
1280 LO Cij = PIcol2Accol[j];
1281 SC Plj = Ivals[jj];
1282
1283 TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1284 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
1285 << "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1286
1287 Cvals[ac_status[Cij]] += Rik * Akl * Plj;
1288 }
1289 }
1290 }
1291 }
1292 }
1293
1294#ifdef HAVE_TPETRA_MMM_TIMINGS
1295 auto MM3 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse ESFC"))));
1296#endif
1297
1298 Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());
1299}
1300
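// Caller-side sketch (illustrative only; R, A, P, and Ac are placeholder
// names, and the choice between the "new matrix" and "reuse" kernels is
// made inside MultiplyRAP, not by the caller): a second multiply into an
// already fill-complete Ac is what reaches the reuse path above.
//
//   typedef Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crs_matrix_type;
//   Teuchos::RCP<crs_matrix_type> R = ..., A = ..., P = ..., Ac = ...;
//   // First call: builds the graph and the values of Ac.
//   Tpetra::TripleMatrixMultiply::MultiplyRAP(*R, false, *A, false, *P, false, *Ac);
//   // A's values change but its structure does not; reuse Ac's graph.
//   Tpetra::TripleMatrixMultiply::MultiplyRAP(*R, false, *A, false, *P, false, *Ac);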
1301/*********************************************************************************************************/
1302// PT_A_P NewMatrix Kernel wrappers (Default, general, non-threaded version)
1303// Computes P.T * A * P -> Ac
1304template <class Scalar,
1305 class LocalOrdinal,
1306 class GlobalOrdinal,
1307 class Node,
1308 class LocalOrdinalViewType>
1309void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1310 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1311 const LocalOrdinalViewType& Acol2Prow,
1312 const LocalOrdinalViewType& Acol2PIrow,
1313 const LocalOrdinalViewType& Pcol2Accol,
1314 const LocalOrdinalViewType& PIcol2Accol,
1315 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1316 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
1317 const std::string& label,
1318 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1319#ifdef HAVE_TPETRA_MMM_TIMINGS
1320 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1321 using Teuchos::TimeMonitor;
1322 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1323#endif
1324
1325 // We don't need a kernel-level PTAP; we just transpose P here and reuse the R*A*P kernel.
1326 typedef RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer_type;
1327 transposer_type transposer(Pview.origMatrix, label + std::string("XP: "));
1328
1329 using Teuchos::ParameterList;
1330 using Teuchos::RCP;
1331 RCP<ParameterList> transposeParams(new ParameterList);
1332 transposeParams->set("sort", false);
1333
1334 if (!params.is_null()) {
1335 transposeParams->set("compute global constants",
1336 params->get("compute global constants: temporaries",
1337 false));
1338 }
1339 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1340 transposer.createTransposeLocal(transposeParams);
1341 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1342 Rview.origMatrix = Ptrans;
1343
1344 mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview, Acol2Prow, Acol2PIrow, Pcol2Accol, PIcol2Accol, Ac, Acimport, label, params);
1345}
1346
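// Design note: forming R = P^T explicitly via a purely local transpose (no
// communication) lets the single R*A*P kernel above serve the P^T*A*P case
// as well, at the cost of materializing the transposed P.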
1347/*********************************************************************************************************/
1348// PT_A_P Reuse Kernel wrappers (Default, general, non-threaded version)
1349// Computes P.T * A * P -> Ac
1350template <class Scalar,
1351 class LocalOrdinal,
1352 class GlobalOrdinal,
1353 class Node,
1354 class LocalOrdinalViewType>
1355void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1356 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1357 const LocalOrdinalViewType& Acol2Prow,
1358 const LocalOrdinalViewType& Acol2PIrow,
1359 const LocalOrdinalViewType& Pcol2Accol,
1360 const LocalOrdinalViewType& PIcol2Accol,
1361 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1362 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
1363 const std::string& label,
1364 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1365#ifdef HAVE_TPETRA_MMM_TIMINGS
1366 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1367 using Teuchos::TimeMonitor;
1368 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1369#endif
1370
1371 // We don't need a kernel-level PTAP; we just transpose P here and reuse the R*A*P kernel.
1372 typedef RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer_type;
1373 transposer_type transposer(Pview.origMatrix, label + std::string("XP: "));
1374
1375 using Teuchos::ParameterList;
1376 using Teuchos::RCP;
1377 RCP<ParameterList> transposeParams(new ParameterList);
1378 transposeParams->set("sort", false);
1379
1380 if (!params.is_null()) {
1381 transposeParams->set("compute global constants",
1382 params->get("compute global constants: temporaries",
1383 false));
1384 }
1385 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1386 transposer.createTransposeLocal(transposeParams);
1387 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1388 Rview.origMatrix = Ptrans;
1389
1390 mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview, Acol2Prow, Acol2PIrow, Pcol2Accol, PIcol2Accol, Ac, Acimport, label, params);
1391}
1392
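// Usage sketch (illustrative; Ptent and Ac are placeholder names): callers
// such as multigrid setup codes reach these P^T*A*P wrappers through the
// public MultiplyRAP entry point by passing P as the first argument with
// transposeR = true:
//
//   Tpetra::TripleMatrixMultiply::MultiplyRAP(*Ptent, true, *A, false, *Ptent, false, *Ac);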
1393/*********************************************************************************************************/
1394// PT_A_P NewMatrix Kernel wrappers (Default non-threaded version)
1395// Computes P.T * A * P -> Ac using a 2-pass algorithm.
1396// This turned out to be slower on SerialNode, but it might still be helpful when going to Kokkos, so I left it in.
1397// Currently, this implementation never gets called.
1398template <class Scalar,
1399 class LocalOrdinal,
1400 class GlobalOrdinal,
1401 class Node>
1402void KernelWrappers3MMM<Scalar, LocalOrdinal, GlobalOrdinal, Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1403 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1404 const Teuchos::Array<LocalOrdinal>& Acol2PRow,
1405 const Teuchos::Array<LocalOrdinal>& Acol2PRowImport,
1406 const Teuchos::Array<LocalOrdinal>& Pcol2Accol,
1407 const Teuchos::Array<LocalOrdinal>& PIcol2Accol,
1408 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1409 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
1410 const std::string& label,
1411 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1412#ifdef HAVE_TPETRA_MMM_TIMINGS
1413 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1414 using Teuchos::TimeMonitor;
1415 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix SerialCore"))));
1416#endif
1417
1418 using Teuchos::Array;
1419 using Teuchos::ArrayRCP;
1420 using Teuchos::ArrayView;
1421 using Teuchos::RCP;
1422 using Teuchos::rcp;
1423
1424 typedef Scalar SC;
1425 typedef LocalOrdinal LO;
1426 typedef GlobalOrdinal GO;
1427 typedef Node NO;
1428 typedef RowMatrixTransposer<SC, LO, GO, NO> transposer_type;
1429 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1430 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1431
1432 // number of rows on the process of the fine matrix
1433 size_t m = Pview.origMatrix->getLocalNumRows();
1434 // number of rows on the process of the coarse matrix
1435 size_t n = Ac.getRowMap()->getLocalNumElements();
1436 LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1437
1438 // Get Data Pointers
1439 ArrayRCP<size_t> Acrowptr_RCP;
1440 ArrayRCP<LO> Accolind_RCP;
1441 ArrayRCP<SC> Acvals_RCP;
1442
1443 // mfh 27 Sep 2016: get the three CSR arrays (row pointers, column
1444 // indices, values) out of each CrsMatrix. This code computes R * A * (P_local +
1445 // P_remote), where R = P^T (formed below), P_local contains the locally owned
1446 // rows of P, and P_remote the (previously Import'ed) remote rows of P.
1447
1448 auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1449 auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1450 auto Avals = Aview.origMatrix->getLocalValuesHost(
1451 Tpetra::Access::ReadOnly);
1452 auto Prowptr = Pview.origMatrix->getLocalRowPtrsHost();
1453 auto Pcolind = Pview.origMatrix->getLocalIndicesHost();
1454 auto Pvals = Pview.origMatrix->getLocalValuesHost(
1455 Tpetra::Access::ReadOnly);
1456 decltype(Prowptr) Irowptr;
1457 decltype(Pcolind) Icolind;
1458 decltype(Pvals) Ivals;
1459
1460 if (!Pview.importMatrix.is_null()) {
1461 Irowptr = Pview.importMatrix->getLocalRowPtrsHost();
1462 Icolind = Pview.importMatrix->getLocalIndicesHost();
1463 Ivals = Pview.importMatrix->getLocalValuesHost(
1464 Tpetra::Access::ReadOnly);
1465 }
1466
1467 // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1468 // where Teuchos::ArrayRCP::operator[] may be slower than
1469 // Teuchos::ArrayView::operator[].
1470
1471 // For efficiency
1472 ArrayView<size_t> Acrowptr;
1473 ArrayView<LO> Accolind;
1474 ArrayView<SC> Acvals;
1475
1476 ////////////////////////////////////////////////////////////////////////////
1477 // In a first pass, determine the graph of Ac.
1478 ////////////////////////////////////////////////////////////////////////////
1479
1480 ////////////////////////////////////////////////////////////////////////////
1481 // Get the graph of Ac. This gets the local transpose of P,
1482 // then loops over R, A, P to get the graph of Ac.
1483 ////////////////////////////////////////////////////////////////////////////
1484
1485#ifdef HAVE_TPETRA_MMM_TIMINGS
1486 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
1487#endif
1488
1489 ////////////////////////////////////////////////////////////////////////////
1490 // Get the local transpose of the graph of P by locally transposing
1491 // all of P
1492
1493 transposer_type transposer(Pview.origMatrix, label + std::string("XP: "));
1494
1495 using Teuchos::ParameterList;
1496 RCP<ParameterList> transposeParams(new ParameterList);
1497 transposeParams->set("sort", false);
1498 if (!params.is_null()) {
1499 transposeParams->set("compute global constants",
1500 params->get("compute global constants: temporaries",
1501 false));
1502 }
1503 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1504 transposer.createTransposeLocal(transposeParams);
1505
1506 auto Rrowptr = Ptrans->getLocalRowPtrsHost();
1507 auto Rcolind = Ptrans->getLocalIndicesHost();
1508 auto Rvals = Ptrans->getLocalValuesHost(Tpetra::Access::ReadOnly);
1509
1510 ////////////////////////////////////////////////////////////////////////////
1511 // Construct graph
1512
1513#ifdef HAVE_TPETRA_MMM_TIMINGS
1514 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP graph"))));
1515#endif
1516
1517 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1518 Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1519
1520 size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1521 size_t nnzPerRowA = 100; // fallback estimate, used only if A has no stored entries
1522 if (Aview.origMatrix->getLocalNumEntries() > 0)
1523 nnzPerRowA = Aview.origMatrix->getLocalNumEntries() / Aview.origMatrix->getLocalNumRows();
1524 Acrowptr_RCP.resize(n + 1);
1525 Acrowptr = Acrowptr_RCP();
1526 Accolind_RCP.resize(nnz_alloc);
1527 Accolind = Accolind_RCP();
1528
1529 size_t nnz = 0, nnz_old = 0;
1530 for (size_t i = 0; i < n; i++) {
1531 // mfh 27 Sep 2016: n is the number of rows in the input matrix R
1532 // (and therefore in Ac) on the calling process.
1533 Acrowptr[i] = nnz;
1534
1535 // mfh 27 Sep 2016: For each entry of R in the current row of R
1536 for (size_t kk = Rrowptr[i]; kk < Rrowptr[i + 1]; kk++) {
1537 LO k = Rcolind[kk]; // local column index of current entry of R
1538 // For each entry of A in the current row of A
1539 for (size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
1540 LO l = Acolind[ll]; // local column index of current entry of A
1541
1542 if (Acol2PRow[l] != LO_INVALID) {
1543 // mfh 27 Sep 2016: If the entry of Acol2PRow
1544 // corresponding to the current entry of A is populated, then
1545 // the corresponding row of P is in P_local (i.e., it lives on
1546 // the calling process).
1547
1548 // Local matrix
1549 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1550
1551 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1552 for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
1553 LO j = Pcolind[jj];
1554 LO Acj = Pcol2Accol[j];
1555
1556 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1557 // New entry
1558 ac_status[Acj] = nnz;
1559 Accolind[nnz] = Acj;
1560 nnz++;
1561 }
1562 }
1563 } else {
1564 // mfh 27 Sep 2016: If the entry of Acol2PRow
1565 // corresponding to the current entry of A is NOT populated (has
1566 // a flag "invalid" value), then the corresponding row of P is
1567 // in P_remote (i.e., it does not live on the calling process).
1568
1569 // Remote matrix
1570 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1571 for (size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
1572 LO j = Icolind[jj];
1573 LO Acj = PIcol2Accol[j];
1574
1575 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1576 // New entry
1577 ac_status[Acj] = nnz;
1578 Accolind[nnz] = Acj;
1579 nnz++;
1580 }
1581 }
1582 }
1583 }
1584 }
1585 // Resize for next pass if needed
1586 // cag: Maybe we can do something more subtle here, and not double
1587 // the size right away.
1588 if (nnz + std::max(5 * nnzPerRowA, n) > nnz_alloc) {
1589 nnz_alloc *= 2;
1590 nnz_alloc = std::max(nnz_alloc, nnz + std::max(5 * nnzPerRowA, n));
1591 Accolind_RCP.resize(nnz_alloc);
1592 Accolind = Accolind_RCP();
1593 Acvals_RCP.resize(nnz_alloc);
1594 Acvals = Acvals_RCP();
1595 }
1596 nnz_old = nnz;
1597 }
1598 Acrowptr[n] = nnz;
1599
1600 // Downward resize
1601 Accolind_RCP.resize(nnz);
1602 Accolind = Accolind_RCP();
1603
1604 // Allocate Acvals
1605 Acvals_RCP.resize(nnz, SC_ZERO);
1606 Acvals = Acvals_RCP();
1607
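 // Sanity-check sketch (hypothetical debug code, not part of this routine):
 // after the first pass, the usual CSR invariants should already hold even
 // though no values have been filled in yet:
 //
 //   TEUCHOS_TEST_FOR_EXCEPTION(
 //     Acrowptr[0] != 0 || Acrowptr[n] != nnz, std::logic_error,
 //     "first pass produced a malformed row-pointer array");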
1608 ////////////////////////////////////////////////////////////////////////////
1609 // In a second pass, enter the values into Acvals.
1610 ////////////////////////////////////////////////////////////////////////////
1611
1612#ifdef HAVE_TPETRA_MMM_TIMINGS
1613 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
1614#endif
1615
1616 for (size_t k = 0; k < m; k++) { // over the m fine rows of P and A (looping to n would stop short when m > n)
1617 for (size_t ii = Prowptr[k]; ii < Prowptr[k + 1]; ii++) {
1618 LO i = Pcolind[ii];
1619 const SC Pki = Pvals[ii];
1620 for (size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
1621 LO l = Acolind[ll];
1622 const SC Akl = Avals[ll];
1623 if (Akl == SC_ZERO)
1624 continue; // skip explicitly stored zero values in A
1625 if (Acol2PRow[l] != LO_INVALID) {
1626 // mfh 27 Sep 2016: If the entry of Acol2PRow
1627 // corresponding to the current entry of A is populated, then
1628 // the corresponding row of P is in P_local (i.e., it lives on
1629 // the calling process).
1630
1631 // Local matrix
1632 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1633 for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
1634 LO j = Pcolind[jj];
1635 LO Acj = Pcol2Accol[j];
1636 size_t pp;
1637 for (pp = Acrowptr[i]; pp < Acrowptr[i + 1]; pp++)
1638 if (Accolind[pp] == Acj)
1639 break;
1640 // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1641 // std::runtime_error, "problem with Ac column indices");
1642 Acvals[pp] += Pki * Akl * Pvals[jj];
1643 }
1644 } else {
1645 // mfh 27 Sep 2016: If the entry of Acol2PRow
1646 // corresponding to the current entry of A is NOT populated (has
1647 // a flag "invalid" value), then the corresponding row of P is
1648 // in P_remote (i.e., it does not live on the calling process).
1649
1650 // Remote matrix
1651 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1652 for (size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
1653 LO j = Icolind[jj];
1654 LO Acj = PIcol2Accol[j];
1655 size_t pp;
1656 for (pp = Acrowptr[i]; pp < Acrowptr[i + 1]; pp++)
1657 if (Accolind[pp] == Acj)
1658 break;
1659 // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1660 // std::runtime_error, "problem with Ac column indices");
1661 Acvals[pp] += Pki * Akl * Ivals[jj];
1662 }
1663 }
1664 }
1665 }
1666 }
1667
1668#ifdef HAVE_TPETRA_MMM_TIMINGS
1669 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
1670#endif
1671
1672 // Final sort & set of CRS arrays
1673 //
1674 // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
1675 // matrix-matrix multiply routine sort the entries for us?
1676 Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
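 // For example (illustrative values): a row stored as
 //   Accolind = {5, 2, 9}, Acvals = {a, b, c}
 // becomes, after sorting,
 //   Accolind = {2, 5, 9}, Acvals = {b, a, c};
 // column indices ascend within each row and the values move with them.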
1677
1678 // mfh 27 Sep 2016: This just sets pointers.
1679 Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1680
1681#ifdef HAVE_TPETRA_MMM_TIMINGS
1682 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
1683#endif
1684
1685 // Final FillComplete
1686 //
1687 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1688 // Import (from domain Map to column Map) construction (which costs
1689 // lots of communication) by taking the previously constructed
1690 // Import object. We should be able to do this without interfering
1691 // with the implementation of the local part of sparse matrix-matrix
1692 // multiply above.
1693 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1694 labelList->set("Timer Label", label);
1695 // labelList->set("Sort column Map ghost GIDs")
1696 if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
1697 RCP<const Export<LO, GO, NO> > dummyExport;
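 // For Ac = P^T * A * P, the domain and range Maps of Ac both coincide with
 // the domain Map of P (the coarse-space Map), which is why P's domain Map
 // is passed twice below.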
1698 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1699 Pview.origMatrix->getDomainMap(),
1700 Acimport,
1701 dummyExport, labelList);
1702}
1703
1704} // namespace MMdetails
1705
1706} // End namespace Tpetra
1707//
1708// Explicit instantiation macro
1709//
1710// Must be expanded from within the Tpetra namespace!
1711//
1712
1713#define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR, LO, GO, NODE) \
1714 \
1715 template void TripleMatrixMultiply::MultiplyRAP( \
1716 const CrsMatrix<SCALAR, LO, GO, NODE>& R, \
1717 bool transposeR, \
1718 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
1719 bool transposeA, \
1720 const CrsMatrix<SCALAR, LO, GO, NODE>& P, \
1721 bool transposeP, \
1722 CrsMatrix<SCALAR, LO, GO, NODE>& Ac, \
1723 bool call_FillComplete_on_result, \
1724 const std::string& label, \
1725 const Teuchos::RCP<Teuchos::ParameterList>& params);
1726
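// Expansion sketch (illustrative; the Node type below is a placeholder, and
// in practice Tpetra's explicit-template-instantiation system generates
// these expansions): from within namespace Tpetra, one instantiation per
// enabled type combination, e.g.
//
//   TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(double, int, long long, Tpetra::KokkosCompat::KokkosSerialWrapperNode)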
1727#endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP