Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_MATRIXMATRIX_DEF_HPP
11#define TPETRA_MATRIXMATRIX_DEF_HPP
13#include "KokkosSparse_Utils.hpp"
14#include "Tpetra_ConfigDefs.hpp"
16#include "Teuchos_VerboseObject.hpp"
17#include "Teuchos_Array.hpp"
18#include "Tpetra_Util.hpp"
19#include "Tpetra_CrsMatrix.hpp"
20#include "Tpetra_BlockCrsMatrix.hpp"
22#include "Tpetra_RowMatrixTransposer.hpp"
25#include "Tpetra_Details_makeColMap.hpp"
26#include "Tpetra_ConfigDefs.hpp"
27#include "Tpetra_Map.hpp"
28#include "Tpetra_Export.hpp"
32
33#include <algorithm>
34#include <type_traits>
35#include "Teuchos_FancyOStream.hpp"
36
37#include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
39
40#include "KokkosSparse_spgemm.hpp"
41#include "KokkosSparse_spadd.hpp"
42#include "Kokkos_Bitset.hpp"
43
45
51/*********************************************************************************************************/
52// Include the architecture-specific kernel partial specializations here
53// NOTE: This needs to be outside all namespaces
54#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
55#include "TpetraExt_MatrixMatrix_Cuda.hpp"
56#include "TpetraExt_MatrixMatrix_HIP.hpp"
57#include "TpetraExt_MatrixMatrix_SYCL.hpp"
58
59namespace Tpetra {
60
61namespace MatrixMatrix {
62
63//
64// This method forms the matrix-matrix product C = op(A) * op(B), where
65// op(A) == A if transposeA is false,
66// op(A) == A^T if transposeA is true,
67// and similarly for op(B).
68//
69template <class Scalar,
70 class LocalOrdinal,
71 class GlobalOrdinal,
72 class Node>
75 bool transposeA,
77 bool transposeB,
80 const std::string& label,
81 const Teuchos::RCP<Teuchos::ParameterList>& params) {
82 using Teuchos::null;
83 using Teuchos::RCP;
84 using Teuchos::rcp;
85 typedef Scalar SC;
86 typedef LocalOrdinal LO;
87 typedef GlobalOrdinal GO;
88 typedef Node NO;
89 typedef CrsMatrix<SC, LO, GO, NO> crs_matrix_type;
90 typedef Import<LO, GO, NO> import_type;
92 typedef Map<LO, GO, NO> map_type;
94
96
97 const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
98
99 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
100
101 // The input matrices A and B must both be fillComplete.
102 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
103 TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
104
105 // If transposeA is true, then Aprime will be the transpose of A
106 // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
107 // will just be a pointer to A.
109 // If transposeB is true, then Bprime will be the transpose of B
110 // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
111 // will just be a pointer to B.
113
114 // Is this a "clean" matrix?
115 //
116 // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
117 // locally nor globally indexed, then it was empty. I don't like
118 // this, because the most straightforward implementation presumes
119 // lazy allocation of indices. However, historical precedent
120 // demands that we keep around this predicate as a way to test
121 // whether the matrix is empty.
122 const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
123
124 bool use_optimized_ATB = false;
126 use_optimized_ATB = true;
127
128#ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
129 use_optimized_ATB = false;
130#endif
131
132 using Teuchos::ParameterList;
134 transposeParams->set("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
135
138 Aprime = transposer.createTranspose(transposeParams);
139 } else {
141 }
142
143 if (transposeB) {
145 Bprime = transposer.createTranspose(transposeParams);
146 } else {
148 }
149
150 // Check size compatibility
151 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
152 global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
153 global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
154 global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
155 global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
156 global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
157 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
158 prefix << "ERROR, inner dimensions of op(A) and op(B) "
159 "must match for matrix-matrix product. op(A) is "
160 << Aouter << "x" << Ainner << ", op(B) is " << Binner << "x" << Bouter);
161
162 // The result matrix C must at least have a row-map that reflects the correct
163 // row-size. Don't check the number of columns because rectangular matrices
164 // which were constructed with only one map can still end up having the
165 // correct capacity and dimensions when filled.
166 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
167 prefix << "ERROR, dimensions of result C must "
168 "match dimensions of op(A) * op(B). C has "
169 << C.getGlobalNumRows()
170 << " rows, should have at least " << Aouter << std::endl);
171
172 // It doesn't matter whether C is already Filled or not. If it is already
173 // Filled, it must have space allocated for the positions that will be
174 // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
175 // we'll error out later when trying to store result values.
176
177 // CGB: However, matrix must be in active-fill
178 if (!C.isFillActive()) C.resumeFill();
179
180 // We're going to need to import remotely-owned sections of A and/or B if
181 // more than one processor is performing this run, depending on the scenario.
182 int numProcs = A.getComm()->getSize();
183
184 // Declare a couple of structs that will be used to hold views of the data
185 // of A and B, to be used for fast access during the matrix-multiplication.
188
191
192 {
193 Tpetra::Details::ProfilingRegion r("TpetraExt: MMM: All I&X");
194
195 // Now import any needed remote rows and populate the Aview struct
196 // NOTE: We assert that an import isn't needed --- since we do the transpose
197 // above to handle that.
198 if (!use_optimized_ATB) {
200 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
201 }
202
203 // We will also need local access to all rows of B that correspond to the
204 // column-map of op(A).
205 if (numProcs > 1)
206 targetMap_B = Aprime->getColMap();
207
208 // Import any needed remote rows and populate the Bview struct.
210 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
211
212 } // stop MM_importExtract here
213
214 // stop the setup timer, and start the multiply timer
215 MM = Teuchos::null;
216 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: All Multiply"));
217
218 // Call the appropriate method to perform the actual multiplication.
219 if (use_optimized_ATB) {
220 MMdetails::mult_AT_B_newmatrix(A, B, C, label, params);
221
222 } else if (call_FillComplete_on_result && newFlag) {
223 MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label, params);
224
225 } else if (call_FillComplete_on_result) {
226 MMdetails::mult_A_B_reuse(Aview, Bview, C, label, params);
227
228 } else {
229 // mfh 27 Sep 2016: Is this the "slow" case? This
230 // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
231 // thread-parallel inserts, but that may take some effort.
233
234 MMdetails::mult_A_B(Aview, Bview, crsmat, label, params);
235 }
236
237 MM = Teuchos::null;
238 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: All FillComplete"));
239 if (call_FillComplete_on_result && !C.isFillComplete()) {
240 // We'll call FillComplete on the C matrix before we exit, and give it a
241 // domain-map and a range-map.
242 // The domain-map will be the domain-map of B, unless
243 // op(B)==transpose(B), in which case the range-map of B will be used.
244 // The range-map will be the range-map of A, unless op(A)==transpose(A),
245 // in which case the domain-map of A will be used.
246 C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
247 }
248}
249
250//
251// This method forms the matrix-matrix product C = op(A) * op(B), where
252// op(A) == A and similarly for op(B). op(A) = A^T is not yet implemented.
253//
254template <class Scalar,
255 class LocalOrdinal,
256 class GlobalOrdinal,
257 class Node>
260 bool transposeA,
262 bool transposeB,
264 const std::string& label) {
265 using Teuchos::null;
266 using Teuchos::RCP;
267 using Teuchos::rcp;
268 typedef Scalar SC;
269 typedef LocalOrdinal LO;
270 typedef GlobalOrdinal GO;
271 typedef Node NO;
273 typedef Map<LO, GO, NO> map_type;
274 typedef Import<LO, GO, NO> import_type;
275
276 std::string prefix = std::string("TpetraExt ") + label + std::string(": ");
277
278 TEUCHOS_TEST_FOR_EXCEPTION(transposeA == true, std::runtime_error, prefix << "Matrix A cannot be transposed.");
279 TEUCHOS_TEST_FOR_EXCEPTION(transposeB == true, std::runtime_error, prefix << "Matrix B cannot be transposed.");
280
281 // Check size compatibility
282 global_size_t numACols = A->getGlobalNumCols();
283 global_size_t numBCols = B->getGlobalNumCols();
284 global_size_t numARows = A->getGlobalNumRows();
285 global_size_t numBRows = B->getGlobalNumRows();
286
291 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
292 prefix << "ERROR, inner dimensions of op(A) and op(B) "
293 "must match for matrix-matrix product. op(A) is "
294 << Aouter << "x" << Ainner << ", op(B) is " << Binner << "x" << Bouter);
295
296 // We're going to need to import remotely-owned sections of A and/or B if
297 // more than one processor is performing this run, depending on the scenario.
298 int numProcs = A->getComm()->getSize();
299
300 const LO blocksize = A->getBlockSize();
301 TEUCHOS_TEST_FOR_EXCEPTION(blocksize != B->getBlockSize(), std::runtime_error,
302 prefix << "ERROR, Blocksizes do not match. A.blocksize = " << blocksize << ", B.blocksize = " << B->getBlockSize());
303
304 // Declare a couple of structs that will be used to hold views of the data
305 // of A and B, to be used for fast access during the matrix-multiplication.
308
309 RCP<const map_type> targetMap_A = A->getRowMap();
310 RCP<const map_type> targetMap_B = B->getRowMap();
311
312 // Populate the Aview struct. No remotes are needed.
314 MMdetails::import_and_extract_views(*A, targetMap_A, Aview, dummyImporter, true);
315
316 // We will also need local access to all rows of B that correspond to the
317 // column-map of op(A).
318 if (numProcs > 1)
319 targetMap_B = A->getColMap();
320
321 // Import any needed remote rows and populate the Bview struct.
322 MMdetails::import_and_extract_views(*B, targetMap_B, Bview, A->getGraph()->getImporter(),
323 A->getGraph()->getImporter().is_null());
324
325 // Call the appropriate method to perform the actual multiplication.
326 MMdetails::mult_A_B_newmatrix(Aview, Bview, C);
327}
328
329template <class Scalar,
330 class LocalOrdinal,
331 class GlobalOrdinal,
332 class Node>
333void Jacobi(typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
339 const std::string& label,
340 const Teuchos::RCP<Teuchos::ParameterList>& params) {
341 using Teuchos::RCP;
342 using Teuchos::rcp;
343 typedef Scalar SC;
344 typedef LocalOrdinal LO;
345 typedef GlobalOrdinal GO;
346 typedef Node NO;
347 typedef Import<LO, GO, NO> import_type;
349 typedef Map<LO, GO, NO> map_type;
350 typedef CrsMatrix<SC, LO, GO, NO> crs_matrix_type;
351
353
354 const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
355
356 // A and B should already be Filled.
357 // Should we go ahead and call FillComplete() on them if necessary or error
358 // out? For now, we choose to error out.
359 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
360 TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
361
364
365 // Now check size compatibility
366 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
367 global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
368 global_size_t Aouter = A.getGlobalNumRows();
371 global_size_t Binner = B.getGlobalNumRows();
372 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
373 prefix << "ERROR, inner dimensions of op(A) and op(B) "
374 "must match for matrix-matrix product. op(A) is "
375 << Aouter << "x" << Ainner << ", op(B) is " << Binner << "x" << Bouter);
376
377 // The result matrix C must at least have a row-map that reflects the correct
378 // row-size. Don't check the number of columns because rectangular matrices
379 // which were constructed with only one map can still end up having the
380 // correct capacity and dimensions when filled.
381 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
382 prefix << "ERROR, dimensions of result C must "
383 "match dimensions of op(A) * op(B). C has "
384 << C.getGlobalNumRows()
385 << " rows, should have at least " << Aouter << std::endl);
386
387 // It doesn't matter whether C is already Filled or not. If it is already
388 // Filled, it must have space allocated for the positions that will be
389 // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
390 // we'll error out later when trying to store result values.
391
392 // CGB: However, matrix must be in active-fill
393 TEUCHOS_TEST_FOR_EXCEPT(C.isFillActive() == false);
394
395 // We're going to need to import remotely-owned sections of A and/or B if
396 // more than one processor is performing this run, depending on the scenario.
397 int numProcs = A.getComm()->getSize();
398
399 // Declare a couple of structs that will be used to hold views of the data of
400 // A and B, to be used for fast access during the matrix-multiplication.
403
406
407 {
408 Tpetra::Details::ProfilingRegion r("TpetraExt: Jacobi: All I&X");
409 // Enable globalConstants by default
410 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
411 RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
412 if (!params.is_null()) {
413 importParams1->set("compute global constants", params->get("compute global constants: temporaries", false));
415 auto slist = params->sublist("matrixmatrix: kernel params", false);
417 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
419 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete", false);
420 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
421 auto& ip1slist = importParams1->sublist("matrixmatrix: kernel params", false);
422 ip1slist.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
423 ip1slist.set("isMatrixMatrix_TransferAndFillComplete", isMM);
424 ip1slist.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
425 }
426
427 // Now import any needed remote rows and populate the Aview struct.
429 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, importParams1);
430
431 // We will also need local access to all rows of B that correspond to the
432 // column-map of op(A).
433 if (numProcs > 1)
434 targetMap_B = Aprime->getColMap();
435
436 // Now import any needed remote rows and populate the Bview struct.
437 // Enable globalConstants by default
438 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
439 RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
440 if (!params.is_null()) {
441 importParams2->set("compute global constants", params->get("compute global constants: temporaries", false));
442
443 auto slist = params->sublist("matrixmatrix: kernel params", false);
445 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete", false);
446 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
447 auto& ip2slist = importParams2->sublist("matrixmatrix: kernel params", false);
448 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
450 ip2slist.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
451 ip2slist.set("isMatrixMatrix_TransferAndFillComplete", isMM);
452 ip2slist.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
453 }
454
455 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, importParams2);
456 }
457 MM = Teuchos::null;
458 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi All Multiply"));
459
460 // Now call the appropriate method to perform the actual multiplication.
462
463 // Is this a "clean" matrix
464 bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
465
467 MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
468
469 } else if (call_FillComplete_on_result) {
470 MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
471
472 } else {
473 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
474 }
475
476 if (!params.is_null()) {
477 bool removeZeroEntries = params->get("remove zeros", false);
478 if (removeZeroEntries) {
479 typedef Teuchos::ScalarTraits<Scalar> STS;
480 typename STS::magnitudeType threshold = params->get("remove zeros threshold", STS::magnitude(STS::zero()));
482 }
483 }
484}
485
486template <class Scalar,
487 class LocalOrdinal,
488 class GlobalOrdinal,
489 class Node>
490void Add(
492 bool transposeA,
495 Scalar scalarB) {
496 using Teuchos::Array;
497 using Teuchos::null;
498 using Teuchos::RCP;
499 typedef Scalar SC;
500 typedef LocalOrdinal LO;
501 typedef GlobalOrdinal GO;
502 typedef Node NO;
503 typedef CrsMatrix<SC, LO, GO, NO> crs_matrix_type;
505
506 const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
507
508 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
509 prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
510 "(Result matrix B is not required to be isFillComplete()).");
511 TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete(), std::runtime_error,
512 prefix << "ERROR, input matrix B must not be fill complete!");
513 TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph(), std::runtime_error,
514 prefix << "ERROR, input matrix B must not have static graph!");
515 TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed(), std::runtime_error,
516 prefix << "ERROR, input matrix B must not be locally indexed!");
517
518 using Teuchos::ParameterList;
520 transposeParams->set("sort", false);
521
523 if (transposeA) {
525 Aprime = transposer.createTranspose(transposeParams);
526 } else {
528 }
529
530 size_t a_numEntries;
531 typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds", A.getLocalMaxNumRowEntries());
532 typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals", A.getLocalMaxNumRowEntries());
533 GO row;
534
535 if (scalarB != Teuchos::ScalarTraits<SC>::one())
536 B.scale(scalarB);
537
538 size_t numMyRows = B.getLocalNumRows();
539 if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
540 for (LO i = 0; (size_t)i < numMyRows; ++i) {
541 row = B.getRowMap()->getGlobalElement(i);
542 Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
543
544 if (scalarA != Teuchos::ScalarTraits<SC>::one()) {
545 for (size_t j = 0; j < a_numEntries; ++j)
546 a_vals[j] *= scalarA;
547 }
548 B.insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar*>(a_vals.data()), a_inds.data());
549 }
550 }
551}
552
553template <class Scalar,
554 class LocalOrdinal,
555 class GlobalOrdinal,
556 class Node>
557Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
559 const bool transposeA,
561 const Scalar& beta,
562 const bool transposeB,
564 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>& domainMap,
565 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>& rangeMap,
566 const Teuchos::RCP<Teuchos::ParameterList>& params) {
567 using Teuchos::ParameterList;
568 using Teuchos::RCP;
569 using Teuchos::rcp;
570 using Teuchos::rcpFromRef;
572 if (!params.is_null()) {
574 params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
575 std::invalid_argument,
576 "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
577 "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
578 params->set("Call fillComplete", true);
579 }
580 // If transposeB, must compute B's explicit transpose to
581 // get the correct row map for C.
583 if (transposeB) {
585 Brcp = transposer.createTranspose();
586 }
587 // Check that A,B are fillComplete before getting B's column map
588 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete() || !Brcp->isFillComplete(), std::invalid_argument,
589 "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
590 RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
591 // this version of add() always fill completes the result, no matter what is in params on input
592 add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
593 return C;
594}
595
596// This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
597// but since the spadd() output is always packed there is no need for a separate
598// numRowEntries here.
599//
600template <class LO, class GO, class LOView, class GOView, class LocalMap>
601struct ConvertGlobalToLocalFunctor {
602 ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
603 : lids(lids_)
604 , gids(gids_)
605 , localColMap(localColMap_) {}
606
607 KOKKOS_FUNCTION void operator()(const GO i) const {
608 lids(i) = localColMap.getLocalElement(gids(i));
609 }
610
611 LOView lids;
612 const GOView gids;
613 const LocalMap localColMap;
614};
615
616template <class Scalar,
617 class LocalOrdinal,
618 class GlobalOrdinal,
619 class Node>
620void add(const Scalar& alpha,
621 const bool transposeA,
623 const Scalar& beta,
624 const bool transposeB,
627 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>& domainMap,
628 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>& rangeMap,
629 const Teuchos::RCP<Teuchos::ParameterList>& params) {
630 using Teuchos::RCP;
631 using Teuchos::rcp;
632 using Teuchos::rcp_dynamic_cast;
633 using Teuchos::rcp_implicit_cast;
634 using Teuchos::rcpFromRef;
635 using Teuchos::TimeMonitor;
636 using SC = Scalar;
637 using LO = LocalOrdinal;
638 using GO = GlobalOrdinal;
639 using NO = Node;
640 using crs_matrix_type = CrsMatrix<SC, LO, GO, NO>;
641 using crs_graph_type = CrsGraph<LO, GO, NO>;
642 using map_type = Map<LO, GO, NO>;
644 using import_type = Import<LO, GO, NO>;
645 using export_type = Export<LO, GO, NO>;
646 using exec_space = typename crs_graph_type::execution_space;
647 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
648 const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
649 constexpr bool debug = false;
650
652
653 if (debug) {
654 std::ostringstream os;
655 os << "Proc " << A.getMap()->getComm()->getRank() << ": "
656 << "TpetraExt::MatrixMatrix::add" << std::endl;
657 std::cerr << os.str();
658 }
659
660 TEUCHOS_TEST_FOR_EXCEPTION(C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
661 prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
662 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete() || !B.isFillComplete(), std::invalid_argument,
663 prefix_mmm << "A and B must both be fill complete.");
664#ifdef HAVE_TPETRA_DEBUG
665 // The matrices don't have domain or range Maps unless they are fill complete.
666 if (A.isFillComplete() && B.isFillComplete()) {
667 const bool domainMapsSame =
668 (!transposeA && !transposeB &&
669 !A.getDomainMap()->locallySameAs(*B.getDomainMap())) ||
670 (!transposeA && transposeB &&
671 !A.getDomainMap()->isSameAs(*B.getRangeMap())) ||
672 (transposeA && !transposeB &&
673 !A.getRangeMap()->isSameAs(*B.getDomainMap()));
674 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
675 prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
676
677 const bool rangeMapsSame =
678 (!transposeA && !transposeB &&
679 !A.getRangeMap()->isSameAs(*B.getRangeMap())) ||
680 (!transposeA && transposeB &&
681 !A.getRangeMap()->isSameAs(*B.getDomainMap())) ||
682 (transposeA && !transposeB &&
683 !A.getDomainMap()->isSameAs(*B.getRangeMap()));
684 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
685 prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
686 }
687#endif // HAVE_TPETRA_DEBUG
688
689 using Teuchos::ParameterList;
690 // Form the explicit transpose of A if necessary.
692 if (transposeA) {
694 Aprime = transposer.createTranspose();
695 }
696
697#ifdef HAVE_TPETRA_DEBUG
698 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null(), std::logic_error,
699 prefix_mmm << "Failed to compute Op(A). "
700 "Please report this bug to the Tpetra developers.");
701#endif // HAVE_TPETRA_DEBUG
702
703 // Form the explicit transpose of B if necessary.
705 if (transposeB) {
706 if (debug) {
707 std::ostringstream os;
708 os << "Proc " << A.getMap()->getComm()->getRank() << ": "
709 << "Form explicit xpose of B" << std::endl;
710 std::cerr << os.str();
711 }
713 Bprime = transposer.createTranspose();
714 }
715#ifdef HAVE_TPETRA_DEBUG
716 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null(), std::logic_error,
717 prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
719 !Aprime->isFillComplete() || !Bprime->isFillComplete(), std::invalid_argument,
720 prefix_mmm << "Aprime and Bprime must both be fill complete. "
721 "Please report this bug to the Tpetra developers.");
722#endif // HAVE_TPETRA_DEBUG
725 if (CDomainMap.is_null()) {
726 CDomainMap = Bprime->getDomainMap();
727 }
728 if (CRangeMap.is_null()) {
729 CRangeMap = Bprime->getRangeMap();
730 }
731 assert(!(CDomainMap.is_null()));
732 assert(!(CRangeMap.is_null()));
733 typedef typename AddKern::values_array values_array;
734 typedef typename AddKern::row_ptrs_array row_ptrs_array;
735 typedef typename AddKern::col_inds_array col_inds_array;
736 bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
737 bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
738 values_array vals;
739 row_ptrs_array rowptrs;
740 col_inds_array colinds;
741
742 MM = Teuchos::null;
743 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: rowmap check/import"));
744
745 if (!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap())))) {
746 // import Aprime into Bprime's row map so the local matrices have same # of rows
747 auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
748 // cbl do not set
749 // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
750 // this import _may_ take the form of a transfer. In practice it would be unlikely,
751 // but the general case is not so forgiving.
752 Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
753 }
754 bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
756 RCP<const import_type> Cimport = Teuchos::null;
757 RCP<export_type> Cexport = Teuchos::null;
758 bool doFillComplete = true;
759 if (Teuchos::nonnull(params) && params->isParameter("Call fillComplete")) {
760 doFillComplete = params->get<bool>("Call fillComplete");
761 }
762 auto Alocal = Aprime->getLocalMatrixDevice();
763 auto Blocal = Bprime->getLocalMatrixDevice();
764 LO numLocalRows = Alocal.numRows();
765 if (numLocalRows == 0) {
766 // KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
767 // but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
768 // Handle this case now
769 //(without interfering with collective operations, since it's possible for
770 // some ranks to have 0 local rows and others not).
771 rowptrs = row_ptrs_array("C rowptrs", 0);
772 }
773 auto Acolmap = Aprime->getColMap();
774 auto Bcolmap = Bprime->getColMap();
775 if (!matchingColMaps) {
776 using global_col_inds_array = typename AddKern::global_col_inds_array;
777 MM = Teuchos::null;
778 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: mismatched col map full kernel"));
779
780 // use kernel that converts col indices in both A and B to common domain map before adding
781 auto AlocalColmap = Acolmap->getLocalMap();
782 auto BlocalColmap = Bcolmap->getLocalMap();
783 global_col_inds_array globalColinds;
784 if (debug) {
785 std::ostringstream os;
786 os << "Proc " << A.getMap()->getComm()->getRank() << ": "
787 << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
788 std::cerr << os.str();
789 }
790 AddKern::convertToGlobalAndAdd(
793 if (debug) {
794 std::ostringstream os;
795 os << "Proc " << A.getMap()->getComm()->getRank() << ": "
796 << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
797 std::cerr << os.str();
798 }
799 MM = Teuchos::null;
800 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: Constructing graph"));
801
803 Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>(CcolMap, CDomainMap, globalColinds);
804 C.replaceColMap(CcolMap);
805 col_inds_array localColinds("C colinds", globalColinds.extent(0));
806 Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
807 ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
808 col_inds_array, global_col_inds_array,
809 typename map_type::local_map_type>(localColinds, globalColinds, CcolMap->getLocalMap()));
810 Import_Util::sortCrsEntries(rowptrs, localColinds, vals);
811 C.setAllValues(rowptrs, localColinds, vals);
812 C.fillComplete(CDomainMap, CRangeMap, params);
813 if (!doFillComplete)
814 C.resumeFill();
815 } else {
816 // Aprime, Bprime and C all have the same column maps
817 auto Avals = Alocal.values;
818 auto Bvals = Blocal.values;
819 auto Arowptrs = Alocal.graph.row_map;
820 auto Browptrs = Blocal.graph.row_map;
821 auto Acolinds = Alocal.graph.entries;
822 auto Bcolinds = Blocal.graph.entries;
823 if (sorted) {
824 // use sorted kernel
825 MM = Teuchos::null;
826 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: sorted entries full kernel"));
827 if (debug) {
828 std::ostringstream os;
829 os << "Proc " << A.getMap()->getComm()->getRank() << ": "
830 << "Call AddKern::addSorted(...)" << std::endl;
831 std::cerr << os.str();
832 }
833 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
834 } else {
835 // use unsorted kernel
836 MM = Teuchos::null;
837 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: unsorted entries full kernel"));
838
839 if (debug) {
840 std::ostringstream os;
841 os << "Proc " << A.getMap()->getComm()->getRank() << ": "
842 << "Call AddKern::addUnsorted(...)" << std::endl;
843 std::cerr << os.str();
844 }
845 AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
846 }
847 // Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
849 C.replaceColMap(Ccolmap);
850 C.setAllValues(rowptrs, colinds, vals);
851 if (doFillComplete) {
852 MM = Teuchos::null;
853 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: expertStaticFillComplete"));
854 if (!CDomainMap->isSameAs(*Ccolmap)) {
855 if (debug) {
856 std::ostringstream os;
857 os << "Proc " << A.getMap()->getComm()->getRank() << ": "
858 << "Create Cimport" << std::endl;
859 std::cerr << os.str();
860 }
861 Cimport = rcp(new import_type(CDomainMap, Ccolmap));
862 }
863 if (!C.getRowMap()->isSameAs(*CRangeMap)) {
864 if (debug) {
865 std::ostringstream os;
866 os << "Proc " << A.getMap()->getComm()->getRank() << ": "
867 << "Create Cexport" << std::endl;
868 std::cerr << os.str();
869 }
870 Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
871 }
872
873 if (debug) {
874 std::ostringstream os;
875 os << "Proc " << A.getMap()->getComm()->getRank() << ": "
876 << "Call C->expertStaticFillComplete(...)" << std::endl;
877 std::cerr << os.str();
878 }
879 C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
880 }
881 }
882}
883
884// This version of Add takes C as RCP&, so C may be null on input (in this case,
885// it is allocated and constructed in this function).
886template <class Scalar,
887 class LocalOrdinal,
888 class GlobalOrdinal,
889 class Node>
890void Add(
892 bool transposeA,
895 bool transposeB,
898 using std::endl;
899 using Teuchos::Array;
900 using Teuchos::ArrayRCP;
901 using Teuchos::ArrayView;
902 using Teuchos::RCP;
903 using Teuchos::rcp;
904 using Teuchos::rcp_dynamic_cast;
905 using Teuchos::rcpFromRef;
906 using Teuchos::tuple;
907 // typedef typename ArrayView<const Scalar>::size_type size_type;
908 typedef Teuchos::ScalarTraits<Scalar> STS;
910 // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
911 // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
912 // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
915
916 std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
917
919 !A.isFillComplete() || !B.isFillComplete(), std::invalid_argument,
920 prefix << "A and B must both be fill complete before calling this function.");
921
922 if (C.is_null()) {
923 TEUCHOS_TEST_FOR_EXCEPTION(!A.haveGlobalConstants(), std::logic_error,
924 prefix << "C is null (must be allocated), but A.haveGlobalConstants() is false. "
925 "Please report this bug to the Tpetra developers.");
926 TEUCHOS_TEST_FOR_EXCEPTION(!B.haveGlobalConstants(), std::logic_error,
927 prefix << "C is null (must be allocated), but B.haveGlobalConstants() is false. "
928 "Please report this bug to the Tpetra developers.");
929 }
930
931#ifdef HAVE_TPETRA_DEBUG
932 {
933 const bool domainMapsSame =
934 (!transposeA && !transposeB && !A.getDomainMap()->isSameAs(*(B.getDomainMap()))) ||
935 (!transposeA && transposeB && !A.getDomainMap()->isSameAs(*(B.getRangeMap()))) ||
936 (transposeA && !transposeB && !A.getRangeMap()->isSameAs(*(B.getDomainMap())));
937 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
938 prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
939
940 const bool rangeMapsSame =
941 (!transposeA && !transposeB && !A.getRangeMap()->isSameAs(*(B.getRangeMap()))) ||
942 (!transposeA && transposeB && !A.getRangeMap()->isSameAs(*(B.getDomainMap()))) ||
943 (transposeA && !transposeB && !A.getDomainMap()->isSameAs(*(B.getRangeMap())));
944 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
945 prefix << "The range Maps of Op(A) and Op(B) are not the same.");
946 }
947#endif // HAVE_TPETRA_DEBUG
948
949 using Teuchos::ParameterList;
951 transposeParams->set("sort", false);
952
953 // Form the explicit transpose of A if necessary.
955 if (transposeA) {
957 Aprime = theTransposer.createTranspose(transposeParams);
958 } else {
960 }
961
962#ifdef HAVE_TPETRA_DEBUG
963 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null(), std::logic_error,
964 prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
965#endif // HAVE_TPETRA_DEBUG
966
967 // Form the explicit transpose of B if necessary.
969 if (transposeB) {
971 Bprime = theTransposer.createTranspose(transposeParams);
972 } else {
974 }
975
976#ifdef HAVE_TPETRA_DEBUG
977 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null(), std::logic_error,
978 prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
979#endif // HAVE_TPETRA_DEBUG
980
981 bool CwasFillComplete = false;
982
983 // Allocate or zero the entries of the result matrix.
984 if (!C.is_null()) {
985 CwasFillComplete = C->isFillComplete();
987 C->resumeFill();
988 C->setAllToScalar(STS::zero());
989 } else {
990 // FIXME (mfh 08 May 2013) When I first looked at this method, I
991 // noticed that C was being given the row Map of Aprime (the
992 // possibly transposed version of A). Is this what we want?
993
994 // It is a precondition that Aprime and Bprime have the same domain and range maps.
995 // However, they may have different row maps. In this case, it's difficult to
996 // get a precise upper bound on the number of entries in each local row of C, so
997 // just use the looser upper bound based on the max number of entries in any row of Aprime and Bprime.
998 if (Aprime->getRowMap()->isSameAs(*Bprime->getRowMap())) {
999 LocalOrdinal numLocalRows = Aprime->getLocalNumRows();
1001 for (LocalOrdinal i = 0; i < numLocalRows; i++) {
1002 CmaxEntriesPerRow[i] = Aprime->getNumEntriesInLocalRow(i) + Bprime->getNumEntriesInLocalRow(i);
1003 }
1004 C = rcp(new crs_matrix_type(Aprime->getRowMap(), CmaxEntriesPerRow()));
1005 } else {
1006 // Note: above we checked that Aprime and Bprime have global constants, so it's safe to ask for max entries per row.
1007 C = rcp(new crs_matrix_type(Aprime->getRowMap(), Aprime->getGlobalMaxNumRowEntries() + Bprime->getGlobalMaxNumRowEntries()));
1008 }
1009 }
1010
1011#ifdef HAVE_TPETRA_DEBUG
1012 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null(), std::logic_error,
1013 prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1014 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null(), std::logic_error,
1015 prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1016 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null(), std::logic_error,
1017 prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1018#endif // HAVE_TPETRA_DEBUG
1019
1023
1024 // do a loop over each matrix to add: A reordering might be more efficient
1025 for (int k = 0; k < 2; ++k) {
1026 typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1027 typename crs_matrix_type::nonconst_values_host_view_type Values;
1028
1029 // Loop over each locally owned row of the current matrix (either
1030 // Aprime or Bprime), and sum its entries into the corresponding
1031 // row of C. This works regardless of whether Aprime or Bprime
1032 // has the same row Map as C, because both sumIntoGlobalValues and
1033 // insertGlobalValues allow summing resp. inserting into nonowned
1034 // rows of C.
1035#ifdef HAVE_TPETRA_DEBUG
1036 TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null(), std::logic_error,
1037 prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1038#endif // HAVE_TPETRA_DEBUG
1039 RCP<const map_type> curRowMap = Mat[k]->getRowMap();
1040#ifdef HAVE_TPETRA_DEBUG
1041 TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null(), std::logic_error,
1042 prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1043#endif // HAVE_TPETRA_DEBUG
1044
1045 const size_t localNumRows = Mat[k]->getLocalNumRows();
1046 for (size_t i = 0; i < localNumRows; ++i) {
1047 const GlobalOrdinal globalRow = curRowMap->getGlobalElement(i);
1048 size_t numEntries = Mat[k]->getNumEntriesInGlobalRow(globalRow);
1049 if (numEntries > 0) {
1050 if (numEntries > Indices.extent(0)) {
1051 Kokkos::resize(Indices, numEntries);
1052 Kokkos::resize(Values, numEntries);
1053 }
1054 Mat[k]->getGlobalRowCopy(globalRow, Indices, Values, numEntries);
1055
1056 if (scalar[k] != STS::one()) {
1057 for (size_t j = 0; j < numEntries; ++j) {
1058 Values[j] *= scalar[k];
1059 }
1060 }
1061
1062 if (CwasFillComplete) {
1063 size_t result = C->sumIntoGlobalValues(globalRow, numEntries,
1064 reinterpret_cast<Scalar*>(Values.data()), Indices.data());
1065 TEUCHOS_TEST_FOR_EXCEPTION(result != numEntries, std::logic_error,
1066 prefix << "sumIntoGlobalValues failed to add entries from A or B into C.");
1067 } else {
1068 C->insertGlobalValues(globalRow, numEntries,
1069 reinterpret_cast<Scalar*>(Values.data()), Indices.data());
1070 }
1071 }
1072 }
1073 }
1074 if (CwasFillComplete) {
1075 C->fillComplete(C->getDomainMap(),
1076 C->getRangeMap());
1077 }
1078}
1079
1080// This version of Add takes C as const RCP&, so C must not be null on input. Otherwise, its behavior is identical
1081// to the above version where C is RCP&.
1082template <class Scalar,
1083 class LocalOrdinal,
1084 class GlobalOrdinal,
1085 class Node>
1086void Add(
1088 bool transposeA,
1091 bool transposeB,
1094 std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
1095
1096 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null(), std::invalid_argument,
1097 prefix << "C must not be null");
1098
1099 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>> C_ = C;
1101}
1102
1103} // End namespace MatrixMatrix
1104
1105namespace MMdetails {
1106
1107/*********************************************************************************************************/
1109// template <class TransferType>
1110// void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1111// if (Transfer.is_null())
1112// return;
1113//
1114// const Distributor & Distor = Transfer->getDistributor();
1115// Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1116//
1117// size_t rows_send = Transfer->getNumExportIDs();
1118// size_t rows_recv = Transfer->getNumRemoteIDs();
1119//
1120// size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1121// size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1122// size_t num_send_neighbors = Distor.getNumSends();
1123// size_t num_recv_neighbors = Distor.getNumReceives();
1124// size_t round2_send, round2_recv;
1125// Distor.getLastDoStatistics(round2_send,round2_recv);
1126//
1127// int myPID = Comm->getRank();
1128// int NumProcs = Comm->getSize();
1129//
1130// // Processor by processor statistics
1131// // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1132// // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1133//
1134// // Global statistics
1135// size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1136// size_t gstats_min[8], gstats_max[8];
1137//
1138// double lstats_avg[8], gstats_avg[8];
1139// for(int i=0; i<8; i++)
1140// lstats_avg[i] = ((double)lstats[i])/NumProcs;
1141//
1142// Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1143// Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1144// Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1145//
1146// if(!myPID) {
1147// printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1148// (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1149// (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1150// printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1151// (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1152// (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1153// }
1154// }
1155
1156// Kernel method for computing the local portion of C = A*B
1157template <class Scalar,
1158 class LocalOrdinal,
1159 class GlobalOrdinal,
1160 class Node>
1161void mult_AT_B_newmatrix(
1162 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1163 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1164 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1165 const std::string& label,
1166 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1167 using Teuchos::RCP;
1168 using Teuchos::rcp;
1169 typedef Scalar SC;
1170 typedef LocalOrdinal LO;
1171 typedef GlobalOrdinal GO;
1172 typedef Node NO;
1173 typedef CrsMatrixStruct<SC, LO, GO, NO> crs_matrix_struct_type;
1174 typedef RowMatrixTransposer<SC, LO, GO, NO> transposer_type;
1175
1176 RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM-T: Transpose"));
1177
1178 /*************************************************************/
1179 /* 1) Local Transpose of A */
1180 /*************************************************************/
1181 transposer_type transposer(rcpFromRef(A), label + std::string("XP: "));
1182
1183 using Teuchos::ParameterList;
1184 RCP<ParameterList> transposeParams(new ParameterList);
1185 transposeParams->set("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
1186 if (!params.is_null()) {
1187 transposeParams->set("compute global constants",
1188 params->get("compute global constants: temporaries",
1189 false));
1190 }
1191 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1192 transposer.createTransposeLocal(transposeParams);
1193
1194 /*************************************************************/
1195 /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1196 /*************************************************************/
1197 MM = Teuchos::null;
1198 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM-T: I&X"));
1199
1200 // Get views, asserting that no import is required to speed up computation
1201 crs_matrix_struct_type Aview;
1202 crs_matrix_struct_type Bview;
1203 RCP<const Import<LO, GO, NO>> dummyImporter;
1204
1205 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1206 RCP<Teuchos::ParameterList> importParams1(new ParameterList);
1207 if (!params.is_null()) {
1208 importParams1->set("compute global constants",
1209 params->get("compute global constants: temporaries",
1210 false));
1211 auto slist = params->sublist("matrixmatrix: kernel params", false);
1212 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete", false);
1213 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1214 int mm_optimization_core_count =
1216 mm_optimization_core_count =
1217 slist.get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1218 int mm_optimization_core_count2 =
1219 params->get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1220 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1221 mm_optimization_core_count = mm_optimization_core_count2;
1222 }
1223 auto& sip1 = importParams1->sublist("matrixmatrix: kernel params", false);
1224 sip1.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1225 sip1.set("isMatrixMatrix_TransferAndFillComplete", isMM);
1226 sip1.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1227 }
1228
1229 MMdetails::import_and_extract_views(*Atrans, Atrans->getRowMap(),
1230 Aview, dummyImporter, true,
1231 label, importParams1);
1232
1233 RCP<ParameterList> importParams2(new ParameterList);
1234 if (!params.is_null()) {
1235 importParams2->set("compute global constants",
1236 params->get("compute global constants: temporaries",
1237 false));
1238 auto slist = params->sublist("matrixmatrix: kernel params", false);
1239 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete", false);
1240 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1241 int mm_optimization_core_count =
1243 mm_optimization_core_count =
1244 slist.get("MM_TAFC_OptimizationCoreCount",
1245 mm_optimization_core_count);
1246 int mm_optimization_core_count2 =
1247 params->get("MM_TAFC_OptimizationCoreCount",
1248 mm_optimization_core_count);
1249 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1250 mm_optimization_core_count = mm_optimization_core_count2;
1251 }
1252 auto& sip2 = importParams2->sublist("matrixmatrix: kernel params", false);
1253 sip2.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1254 sip2.set("isMatrixMatrix_TransferAndFillComplete", isMM);
1255 sip2.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1256 }
1257
1258 if (B.getRowMap()->isSameAs(*Atrans->getColMap())) {
1259 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter, true, label, importParams2);
1260 } else {
1261 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter, false, label, importParams2);
1262 }
1263
1264 MM = Teuchos::null;
1265 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM-T: AB-core"));
1266
1267 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1268
1269 // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1270 bool needs_final_export = !Atrans->getGraph()->getExporter().is_null();
1271 if (needs_final_export) {
1272 Ctemp = rcp(new Tpetra::CrsMatrix<SC, LO, GO, NO>(Atrans->getRowMap(), 0));
1273 } else {
1274 Ctemp = rcp(&C, false);
1275 }
1276
1277 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label, params);
1278
1279 /*************************************************************/
1280 /* 4) exportAndFillComplete matrix */
1281 /*************************************************************/
1282 MM = Teuchos::null;
1283 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM-T: exportAndFillComplete"));
1284
1285 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp(&C, false);
1286
1287 if (needs_final_export) {
1288 ParameterList labelList;
1289 labelList.set("Timer Label", label);
1290 if (!params.is_null()) {
1291 ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params", false);
1292 ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params", false);
1293 int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1294 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1295 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1296 if (mm_optimization_core_count2 < mm_optimization_core_count) mm_optimization_core_count = mm_optimization_core_count2;
1297 labelList_subList.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count, "Core Count above which the optimized neighbor discovery is used");
1298 bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete", false);
1299 bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck", false);
1300
1301 labelList_subList.set("isMatrixMatrix_TransferAndFillComplete", isMM,
1302 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1303 labelList.set("compute global constants", params->get("compute global constants", true));
1304 labelList.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1305 }
1306 Ctemp->exportAndFillComplete(Crcp,
1307 *Ctemp->getGraph()->getExporter(),
1308 B.getDomainMap(),
1309 A.getDomainMap(),
1310 rcp(&labelList, false));
1311 }
1312#ifdef HAVE_TPETRA_MMM_STATISTICS
1313 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label + std::string(" AT_B MMM"));
1314#endif
1315}
1316
1317/*********************************************************************************************************/
1318// Kernel method for computing the local portion of C = A*B
1319template <class Scalar,
1320 class LocalOrdinal,
1321 class GlobalOrdinal,
1322 class Node>
1323void mult_A_B(
1324 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1325 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1326 CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1327 const std::string& /* label */,
1328 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1329 using Teuchos::Array;
1330 using Teuchos::ArrayRCP;
1331 using Teuchos::ArrayView;
1332 using Teuchos::null;
1333 using Teuchos::OrdinalTraits;
1334
1335 bool skipExplicitZero = true;
1336 if (params && params->isParameter("MM Skip Explicit Zeros")) {
1337 skipExplicitZero = params->get<bool>("MM Skip Explicit Zeros");
1338 }
1339
1340 typedef Teuchos::ScalarTraits<Scalar> STS;
1341 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1342 LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1343 LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1344
1345 LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1346 LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1347
1348 ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getLocalElementList();
1349 ArrayView<const GlobalOrdinal> bcols_import = null;
1350 if (Bview.importColMap != null) {
1351 C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1352 C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1353
1354 bcols_import = Bview.importColMap->getLocalElementList();
1355 }
1356
1357 size_t C_numCols = C_lastCol - C_firstCol +
1358 OrdinalTraits<LocalOrdinal>::one();
1359 size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1360 OrdinalTraits<LocalOrdinal>::one();
1361
1362 if (C_numCols_import > C_numCols)
1363 C_numCols = C_numCols_import;
1364
1365 Array<Scalar> dwork = Array<Scalar>(C_numCols);
1366 Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1367 Array<size_t> iwork2 = Array<size_t>(C_numCols);
1368
1369 Array<Scalar> C_row_i = dwork;
1370 Array<GlobalOrdinal> C_cols = iwork;
1371 Array<size_t> c_index = iwork2;
1372 Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2 * C_numCols);
1373 Array<Scalar> combined_values = Array<Scalar>(2 * C_numCols);
1374
1375 size_t C_row_i_length, j, k, last_index;
1376
1377 // Run through all the hash table lookups once and for all
1378 LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1379 Array<LocalOrdinal> Acol2Brow(Aview.colMap->getLocalNumElements(), LO_INVALID);
1380 Array<LocalOrdinal> Acol2Irow(Aview.colMap->getLocalNumElements(), LO_INVALID);
1381 if (Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())) {
1382 // Maps are the same: Use local IDs as the hash
1383 for (LocalOrdinal i = Aview.colMap->getMinLocalIndex(); i <=
1384 Aview.colMap->getMaxLocalIndex();
1385 i++)
1386 Acol2Brow[i] = i;
1387 } else {
1388 // Maps are not the same: Use the map's hash
1389 for (LocalOrdinal i = Aview.colMap->getMinLocalIndex(); i <=
1390 Aview.colMap->getMaxLocalIndex();
1391 i++) {
1392 GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1393 LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1394 if (BLID != LO_INVALID)
1395 Acol2Brow[i] = BLID;
1396 else
1397 Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1398 }
1399 }
1400
1401 // To form C = A*B we're going to execute this expression:
1402 //
1403 // C(i,j) = sum_k( A(i,k)*B(k,j) )
1404 //
1405 // Our goal, of course, is to navigate the data in A and B once, without
1406 // performing searches for column-indices, etc.
1407 auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1408 auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1409 auto Avals = Aview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1410 auto Browptr = Bview.origMatrix->getLocalRowPtrsHost();
1411 auto Bcolind = Bview.origMatrix->getLocalIndicesHost();
1412 auto Bvals = Bview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1413 decltype(Browptr) Irowptr;
1414 decltype(Bcolind) Icolind;
1415 decltype(Bvals) Ivals;
1416 if (!Bview.importMatrix.is_null()) {
1417 Irowptr = Bview.importMatrix->getLocalRowPtrsHost();
1418 Icolind = Bview.importMatrix->getLocalIndicesHost();
1419 Ivals = Bview.importMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1420 }
1421
1422 bool C_filled = C.isFillComplete();
1423
1424 for (size_t i = 0; i < C_numCols; i++)
1425 c_index[i] = OrdinalTraits<size_t>::invalid();
1426
1427 // Loop over the rows of A.
1428 size_t Arows = Aview.rowMap->getLocalNumElements();
1429 for (size_t i = 0; i < Arows; ++i) {
1430 // Only navigate the local portion of Aview... which is, thankfully, all of
1431 // A since this routine doesn't do transpose modes
1432 GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1433
1434 // Loop across the i-th row of A and for each corresponding row in B, loop
1435 // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1436 // quantities C_row_i. In other words, as we stride across B(k,:) we're
1437 // calculating updates for row i of the result matrix C.
1438 C_row_i_length = OrdinalTraits<size_t>::zero();
1439
1440 for (k = Arowptr[i]; k < Arowptr[i + 1]; ++k) {
1441 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1442 const Scalar Aval = Avals[k];
1443 if (Aval == STS::zero() && skipExplicitZero)
1444 continue;
1445
1446 if (Ak == LO_INVALID)
1447 continue;
1448
1449 for (j = Browptr[Ak]; j < Browptr[Ak + 1]; ++j) {
1450 LocalOrdinal col = Bcolind[j];
1451 // assert(col >= 0 && col < C_numCols);
1452
1453 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1454 // assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1455 // This has to be a += so insertGlobalValue goes out
1456 C_row_i[C_row_i_length] = Aval * Bvals[j];
1457 C_cols[C_row_i_length] = col;
1458 c_index[col] = C_row_i_length;
1459 C_row_i_length++;
1460
1461 } else {
1462 // static cast from impl_scalar_type to Scalar needed for complex
1463 C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Bvals[j]);
1464 }
1465 }
1466 }
1467
1468 for (size_t ii = 0; ii < C_row_i_length; ii++) {
1469 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1470 C_cols[ii] = bcols[C_cols[ii]];
1471 combined_index[ii] = C_cols[ii];
1472 combined_values[ii] = C_row_i[ii];
1473 }
1474 last_index = C_row_i_length;
1475
1476 //
1477 // Now put the C_row_i values into C.
1478 //
1479 // We might have to revamp this later.
1480 C_row_i_length = OrdinalTraits<size_t>::zero();
1481
1482 for (k = Arowptr[i]; k < Arowptr[i + 1]; ++k) {
1483 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1484 const Scalar Aval = Avals[k];
1485 if (Aval == STS::zero() && skipExplicitZero)
1486 continue;
1487
1488 if (Ak != LO_INVALID) continue;
1489
1490 Ak = Acol2Irow[Acolind[k]];
1491 for (j = Irowptr[Ak]; j < Irowptr[Ak + 1]; ++j) {
1492 LocalOrdinal col = Icolind[j];
1493 // assert(col >= 0 && col < C_numCols);
1494
1495 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1496 // assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1497 // This has to be a += so insertGlobalValue goes out
1498 C_row_i[C_row_i_length] = Aval * Ivals[j];
1499 C_cols[C_row_i_length] = col;
1500 c_index[col] = C_row_i_length;
1501 C_row_i_length++;
1502
1503 } else {
1504 // This has to be a += so insertGlobalValue goes out
1505 // static cast from impl_scalar_type to Scalar needed for complex
1506 C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Ivals[j]);
1507 }
1508 }
1509 }
1510
1511 for (size_t ii = 0; ii < C_row_i_length; ii++) {
1512 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1513 C_cols[ii] = bcols_import[C_cols[ii]];
1514 combined_index[last_index] = C_cols[ii];
1515 combined_values[last_index] = C_row_i[ii];
1516 last_index++;
1517 }
1518
1519 // Now put the C_row_i values into C.
1520 // We might have to revamp this later.
1521 C_filled ? C.sumIntoGlobalValues(
1522 global_row,
1523 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1524 combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1525 : C.insertGlobalValues(
1526 global_row,
1527 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1528 combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1529 }
1530}
1531
1532/*********************************************************************************************************/
1533template <class Scalar,
1534 class LocalOrdinal,
1535 class GlobalOrdinal,
1536 class Node>
1537void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1538 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal>>::size_type local_length_size;
1539 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1540
1541 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1542 Mview.maxNumRowEntries = Mview.indices[0].size();
1543
1544 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1545 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1546 Mview.maxNumRowEntries = Mview.indices[i].size();
1547 }
1548}
1549
1550/*********************************************************************************************************/
1551template <class CrsMatrixType>
1552size_t C_estimate_nnz(CrsMatrixType& A, CrsMatrixType& B) {
1553 // Follows the NZ estimate in ML's ml_matmatmult.c
1554 size_t Aest = 100, Best = 100;
1555 if (A.getLocalNumEntries() >= A.getLocalNumRows())
1556 Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 100;
1557 if (B.getLocalNumEntries() >= B.getLocalNumRows())
1558 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries() / B.getLocalNumRows() : 100;
1559
1560 size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1561 nnzperrow *= nnzperrow;
1562
1563 return (size_t)(A.getLocalNumRows() * nnzperrow * 0.75 + 100);
1564}
1565
1566/*********************************************************************************************************/
1567// Kernel method for computing the local portion of C = A*B for CrsMatrix
1568//
1569// mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1570// function, so this is probably the function we want to
1571// thread-parallelize.
1572template <class Scalar,
1573 class LocalOrdinal,
1574 class GlobalOrdinal,
1575 class Node>
1576void mult_A_B_newmatrix(
1577 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1578 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1579 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1580 const std::string& label,
1581 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1582 using Teuchos::Array;
1583 using Teuchos::ArrayRCP;
1584 using Teuchos::ArrayView;
1585 using Teuchos::RCP;
1586 using Teuchos::rcp;
1587
1588 // Tpetra typedefs
1589 typedef LocalOrdinal LO;
1590 typedef GlobalOrdinal GO;
1591 typedef Node NO;
1592 typedef Import<LO, GO, NO> import_type;
1593 typedef Map<LO, GO, NO> map_type;
1594
1595 // Kokkos typedefs
1596 typedef typename map_type::local_map_type local_map_type;
1598 typedef typename KCRS::StaticCrsGraphType graph_t;
1599 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1600 typedef typename NO::execution_space execution_space;
1601 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1602 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1603
1604 Tpetra::Details::ProfilingRegion MM("TpetraExt: MMM: M5 Cmap");
1605
1606 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1607
1608 // Build the final importer / column map, hash table lookups for C
1609 RCP<const import_type> Cimport;
1610 RCP<const map_type> Ccolmap;
1611 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1612 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ? Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1613 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1614 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1615 local_map_type Irowmap_local;
1616 if (!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1617 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1618 local_map_type Icolmap_local;
1619 if (!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1620
1621 // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1622 // indices of B, to local column indices of C. (B and C have the
1623 // same number of columns.) The kernel uses this, instead of
1624 // copying the entire input matrix B and converting its column
1625 // indices to those of C.
1626 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Bview.colMap->getLocalNumElements()), Icol2Ccol;
1627
1628 if (Bview.importMatrix.is_null()) {
1629 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1630 Cimport = Bimport;
1631 Ccolmap = Bview.colMap;
1632 const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1633 // Bcol2Ccol is trivial
1634 Kokkos::parallel_for(
1635 "Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1636 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1637 KOKKOS_LAMBDA(const LO i) {
1638 Bcol2Ccol(i) = i;
1639 });
1640 } else {
1641 // mfh 27 Sep 2016: B has "remotes," so we need to build the
1642 // column Map of C, as well as C's Import object (from its domain
1643 // Map to its column Map). C's column Map is the union of the
1644 // column Maps of (the local part of) B, and the "remote" part of
1645 // B. Ditto for the Import. We have optimized this "setUnion"
1646 // operation on Import objects and Maps.
1647
1648 // Choose the right variant of setUnion
1649 if (!Bimport.is_null() && !Iimport.is_null()) {
1650 Cimport = Bimport->setUnion(*Iimport);
1651 } else if (!Bimport.is_null() && Iimport.is_null()) {
1652 Cimport = Bimport->setUnion();
1653 } else if (Bimport.is_null() && !Iimport.is_null()) {
1654 Cimport = Iimport->setUnion();
1655 } else {
1656 throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1657 }
1658 Ccolmap = Cimport->getTargetMap();
1659
1660 // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1661 // in general. We should get rid of it in order to reduce
1662 // communication costs of sparse matrix-matrix multiply.
1663 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1664 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1665
1666 // NOTE: This is not efficient and should be folded into setUnion
1667 //
1668 // mfh 27 Sep 2016: What the above comment means, is that the
1669 // setUnion operation on Import objects could also compute these
1670 // local index - to - local index look-up tables.
1671 Kokkos::resize(Icol2Ccol, Bview.importMatrix->getColMap()->getLocalNumElements());
1672 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1673 Kokkos::parallel_for(
1674 "Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement", range_type(0, Bview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
1675 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1676 });
1677 Kokkos::parallel_for(
1678 "Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement", range_type(0, Bview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
1679 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1680 });
1681 }
1682
1683 // Replace the column map
1684 //
1685 // mfh 27 Sep 2016: We do this because C was originally created
1686 // without a column Map. Now we have its column Map.
1687 C.replaceColMap(Ccolmap);
1688
1689 // mfh 27 Sep 2016: Construct tables that map from local column
1690 // indices of A, to local row indices of either B_local (the locally
1691 // owned part of B), or B_remote (the "imported" remote part of B).
1692 //
1693 // For column index Aik in row i of A, if the corresponding row of B
1694 // exists in the local part of B ("orig") (which I'll call B_local),
1695 // then targetMapToOrigRow[Aik] is the local index of that row of B.
1696 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1697 //
1698 // For column index Aik in row i of A, if the corresponding row of B
1699 // exists in the remote part of B ("Import") (which I'll call
1700 // B_remote), then targetMapToImportRow[Aik] is the local index of
1701 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1702 // (a flag value).
1703
1704 // Run through all the hash table lookups once and for all
1705 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
1706 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
1707
1708 Kokkos::parallel_for(
1709 "Tpetra::mult_A_B_newmatrix::construct_tables", range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
1710 GO aidx = Acolmap_local.getGlobalElement(i);
1711 LO B_LID = Browmap_local.getLocalElement(aidx);
1712 if (B_LID != LO_INVALID) {
1713 targetMapToOrigRow(i) = B_LID;
1714 targetMapToImportRow(i) = LO_INVALID;
1715 } else {
1716 LO I_LID = Irowmap_local.getLocalElement(aidx);
1717 targetMapToOrigRow(i) = LO_INVALID;
1718 targetMapToImportRow(i) = I_LID;
1719 }
1720 });
1721
1722 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1723 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1724 KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview, Bview, targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
1725}
1726
1727/*********************************************************************************************************/
1728// Kernel method for computing the local portion of C = A*B for BlockCrsMatrix
1729template <class Scalar,
1730 class LocalOrdinal,
1731 class GlobalOrdinal,
1732 class Node>
1733void mult_A_B_newmatrix(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1734 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1735 Teuchos::RCP<BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>& C) {
1736 using Teuchos::Array;
1737 using Teuchos::ArrayRCP;
1738 using Teuchos::ArrayView;
1739 using Teuchos::null;
1740 using Teuchos::RCP;
1741 using Teuchos::rcp;
1742
1743 // Tpetra typedefs
1744 typedef LocalOrdinal LO;
1745 typedef GlobalOrdinal GO;
1746 typedef Node NO;
1747 typedef Import<LO, GO, NO> import_type;
1748 typedef Map<LO, GO, NO> map_type;
1749 typedef BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> block_crs_matrix_type;
1750 typedef typename block_crs_matrix_type::crs_graph_type graph_t;
1751
1752 // Kokkos typedefs
1753 typedef typename map_type::local_map_type local_map_type;
1754 typedef typename block_crs_matrix_type::local_matrix_device_type KBSR;
1755 typedef typename KBSR::device_type device_t;
1756 typedef typename KBSR::StaticCrsGraphType static_graph_t;
1757 typedef typename static_graph_t::row_map_type::non_const_type lno_view_t;
1758 typedef typename static_graph_t::entries_type::non_const_type lno_nnz_view_t;
1759 typedef typename KBSR::values_type::non_const_type scalar_view_t;
1760 typedef typename NO::execution_space execution_space;
1761 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1762 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1763
1764 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1765
1766 // Build the final importer / column map, hash table lookups for C
1767 RCP<const import_type> Cimport;
1768 RCP<const map_type> Ccolmap;
1769 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1770 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ? Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1771 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1772 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1773 local_map_type Irowmap_local;
1774 if (!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1775 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1776 local_map_type Icolmap_local;
1777 if (!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1778
1779 // Bcol2Ccol is a table that maps from local column
1780 // indices of B, to local column indices of C. (B and C have the
1781 // same number of columns.) The kernel uses this, instead of
1782 // copying the entire input matrix B and converting its column
1783 // indices to those of C.
1784 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Bview.colMap->getLocalNumElements()), Icol2Ccol;
1785
1786 if (Bview.importMatrix.is_null()) {
1787 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1788 Cimport = Bimport;
1789 Ccolmap = Bview.colMap;
1790 const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1791 // Bcol2Ccol is trivial
1792 Kokkos::parallel_for(
1793 "Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1794 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1795 KOKKOS_LAMBDA(const LO i) {
1796 Bcol2Ccol(i) = i;
1797 });
1798 } else {
1799 // B has "remotes," so we need to build the
1800 // column Map of C, as well as C's Import object (from its domain
1801 // Map to its column Map). C's column Map is the union of the
1802 // column Maps of (the local part of) B, and the "remote" part of
1803 // B. Ditto for the Import. We have optimized this "setUnion"
1804 // operation on Import objects and Maps.
1805
1806 // Choose the right variant of setUnion
1807 if (!Bimport.is_null() && !Iimport.is_null()) {
1808 Cimport = Bimport->setUnion(*Iimport);
1809 } else if (!Bimport.is_null() && Iimport.is_null()) {
1810 Cimport = Bimport->setUnion();
1811 } else if (Bimport.is_null() && !Iimport.is_null()) {
1812 Cimport = Iimport->setUnion();
1813 } else {
1814 throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1815 }
1816 Ccolmap = Cimport->getTargetMap();
1817
1818 // NOTE: This is not efficient and should be folded into setUnion
1819 //
1820 // What the above comment means, is that the
1821 // setUnion operation on Import objects could also compute these
1822 // local index - to - local index look-up tables.
1823 Kokkos::resize(Icol2Ccol, Bview.importMatrix->getColMap()->getLocalNumElements());
1824 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1825 Kokkos::parallel_for(
1826 "Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",
1827 range_type(0, Bview.origMatrix->getColMap()->getLocalNumElements()),
1828 KOKKOS_LAMBDA(const LO i) {
1829 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1830 });
1831 Kokkos::parallel_for(
1832 "Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",
1833 range_type(0, Bview.importMatrix->getColMap()->getLocalNumElements()),
1834 KOKKOS_LAMBDA(const LO i) {
1835 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1836 });
1837 }
1838
1839 // Construct tables that map from local column
1840 // indices of A, to local row indices of either B_local (the locally
1841 // owned part of B), or B_remote (the "imported" remote part of B).
1842 //
1843 // For column index Aik in row i of A, if the corresponding row of B
1844 // exists in the local part of B ("orig") (which I'll call B_local),
1845 // then targetMapToOrigRow[Aik] is the local index of that row of B.
1846 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1847 //
1848 // For column index Aik in row i of A, if the corresponding row of B
1849 // exists in the remote part of B ("Import") (which I'll call
1850 // B_remote), then targetMapToImportRow[Aik] is the local index of
1851 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1852 // (a flag value).
1853
1854 // Run through all the hash table lookups once and for all
1855 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),
1856 Aview.colMap->getLocalNumElements());
1857 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),
1858 Aview.colMap->getLocalNumElements());
1859
1860 Kokkos::parallel_for(
1861 "Tpetra::mult_A_B_newmatrix::construct_tables",
1862 range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1),
1863 KOKKOS_LAMBDA(const LO i) {
1864 GO aidx = Acolmap_local.getGlobalElement(i);
1865 LO B_LID = Browmap_local.getLocalElement(aidx);
1866 if (B_LID != LO_INVALID) {
1867 targetMapToOrigRow(i) = B_LID;
1868 targetMapToImportRow(i) = LO_INVALID;
1869 } else {
1870 LO I_LID = Irowmap_local.getLocalElement(aidx);
1871 targetMapToOrigRow(i) = LO_INVALID;
1872 targetMapToImportRow(i) = I_LID;
1873 }
1874 });
1875
1876 // Create the KernelHandle
1877 using KernelHandle =
1878 KokkosKernels::Experimental::KokkosKernelsHandle<typename lno_view_t::const_value_type,
1879 typename lno_nnz_view_t::const_value_type,
1880 typename scalar_view_t::const_value_type,
1881 typename device_t::execution_space,
1882 typename device_t::memory_space,
1883 typename device_t::memory_space>;
1884 int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
1885 std::string myalg("SPGEMM_KK_MEMORY");
1886 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
1887
1888 KernelHandle kh;
1889 kh.create_spgemm_handle(alg_enum);
1890 kh.set_team_work_size(team_work_size);
1891
1892 // Get KokkosSparse::BsrMatrix for A and Bmerged (B and BImport)
1893 const KBSR Amat = Aview.origMatrix->getLocalMatrixDevice();
1894 const KBSR Bmerged = Tpetra::MMdetails::merge_matrices(Aview, Bview,
1895 targetMapToOrigRow, targetMapToImportRow,
1896 Bcol2Ccol, Icol2Ccol,
1897 Ccolmap.getConst()->getLocalNumElements());
1898
1899 RCP<graph_t> graphC;
1900 typename KBSR::values_type values;
1901 {
1902 // Call KokkosSparse routines to calculate Amat*Bmerged on device.
1903 // NOTE: Need to scope guard this since the BlockCrs constructor will need to copy the host graph
1904 KBSR Cmat;
1905 KokkosSparse::block_spgemm_symbolic(kh, Amat, false, Bmerged, false, Cmat);
1906 KokkosSparse::block_spgemm_numeric(kh, Amat, false, Bmerged, false, Cmat);
1907 kh.destroy_spgemm_handle();
1908
1909 // Build Tpetra::BlockCrsMatrix from KokkosSparse::BsrMatrix
1910 graphC = rcp(new graph_t(Cmat.graph, Aview.origMatrix->getRowMap(), Ccolmap.getConst()));
1911 values = Cmat.values;
1912 }
1913 C = rcp(new block_crs_matrix_type(*graphC, values, Aview.blocksize));
1914}
1915
1916/*********************************************************************************************************/
1917// AB NewMatrix Kernel wrappers (Default non-threaded version for CrsMatrix)
1918template <class Scalar,
1919 class LocalOrdinal,
1920 class GlobalOrdinal,
1921 class Node,
1922 class LocalOrdinalViewType>
1923void KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1924 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1925 const LocalOrdinalViewType& targetMapToOrigRow,
1926 const LocalOrdinalViewType& targetMapToImportRow,
1927 const LocalOrdinalViewType& Bcol2Ccol,
1928 const LocalOrdinalViewType& Icol2Ccol,
1929 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1930 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node>> Cimport,
1931 const std::string& label,
1932 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1933 using Teuchos::Array;
1934 using Teuchos::ArrayRCP;
1935 using Teuchos::ArrayView;
1936 using Teuchos::RCP;
1937 using Teuchos::rcp;
1938
1939 Tpetra::Details::ProfilingRegion MM("TpetraExt: MMM: Newmatrix SerialCore");
1940
1941 // Lots and lots of typedefs
1942 typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
1943 typedef typename KCRS::StaticCrsGraphType graph_t;
1944 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1945 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1946 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1947 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1948
1949 typedef Scalar SC;
1950 typedef LocalOrdinal LO;
1951 typedef GlobalOrdinal GO;
1952 typedef Node NO;
1953 typedef Map<LO, GO, NO> map_type;
1954 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1955 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1956 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1957
1958 bool skipExplicitZero = true;
1959 if (params && params->isParameter("MM Skip Explicit Zeros")) {
1960 skipExplicitZero = params->get<bool>("MM Skip Explicit Zeros");
1961 }
1962
1963 // Sizes
1964 RCP<const map_type> Ccolmap = C.getColMap();
1965 size_t m = Aview.origMatrix->getLocalNumRows();
1966 size_t n = Ccolmap->getLocalNumElements();
1967 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
1968
1969 // Grab the Kokkos::SparseCrsMatrices & inner stuff
1970 const KCRS Amat = Aview.origMatrix->getLocalMatrixHost();
1971 const KCRS Bmat = Bview.origMatrix->getLocalMatrixHost();
1972
1973 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
1974 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
1975 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
1976
1977 c_lno_view_t Irowptr;
1978 lno_nnz_view_t Icolind;
1979 scalar_view_t Ivals;
1980 if (!Bview.importMatrix.is_null()) {
1981 auto lclB = Bview.importMatrix->getLocalMatrixHost();
1982 Irowptr = lclB.graph.row_map;
1983 Icolind = lclB.graph.entries;
1984 Ivals = lclB.values;
1985 b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
1986 }
1987
1988 // Classic csr assembly (low memory edition)
1989 //
1990 // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
1991 // The method loops over rows of A, and may resize after processing
1992 // each row. Chris Siefert says that this reflects experience in
1993 // ML; for the non-threaded case, ML found it faster to spend less
1994 // effort on estimation and risk an occasional reallocation.
1995 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1996 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"), m + 1);
1997 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"), CSR_alloc);
1998 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"), CSR_alloc);
1999
2000 // mfh 27 Sep 2016: The c_status array is an implementation detail
2001 // of the local sparse matrix-matrix multiply routine.
2002
2003 // The status array will contain the index into colind where this entry was last deposited.
2004 // c_status[i] < CSR_ip - not in the row yet
2005 // c_status[i] >= CSR_ip - this is the entry where you can find the data
2006 // We start with this filled with INVALID's indicating that there are no entries yet.
2007 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2008 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2009 std::vector<size_t> c_status(n, ST_INVALID);
2010
2011 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2012 // routine. The routine computes C := A * (B_local + B_remote).
2013 //
2014 // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2015 // you whether the corresponding row of B belongs to B_local
2016 // ("orig") or B_remote ("Import").
2017
2018 // For each row of A/C
2019 size_t CSR_ip = 0, OLD_ip = 0;
2020 for (size_t i = 0; i < m; i++) {
2021 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2022 // on the calling process.
2023 Crowptr[i] = CSR_ip;
2024
2025 // mfh 27 Sep 2016: For each entry of A in the current row of A
2026 for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
2027 LO Aik = Acolind[k]; // local column index of current entry of A
2028 const SC Aval = Avals[k]; // value of current entry of A
2029 if (Aval == SC_ZERO && skipExplicitZero)
2030 continue; // skip explicitly stored zero values in A
2031
2032 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2033 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2034 // corresponding to the current entry of A is populated, then
2035 // the corresponding row of B is in B_local (i.e., it lives on
2036 // the calling process).
2037
2038 // Local matrix
2039 size_t Bk = static_cast<size_t>(targetMapToOrigRow[Aik]);
2040
2041 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
2042 for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
2043 LO Bkj = Bcolind[j];
2044 LO Cij = Bcol2Ccol[Bkj];
2045
2046 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2047 // New entry
2048 c_status[Cij] = CSR_ip;
2049 Ccolind[CSR_ip] = Cij;
2050 Cvals[CSR_ip] = Aval * Bvals[j];
2051 CSR_ip++;
2052
2053 } else {
2054 Cvals[c_status[Cij]] += Aval * Bvals[j];
2055 }
2056 }
2057
2058 } else {
2059 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2060 // corresponding to the current entry of A NOT populated (has
2061 // a flag "invalid" value), then the corresponding row of B is
2062 // in B_local (i.e., it lives on the calling process).
2063
2064 // Remote matrix
2065 size_t Ik = static_cast<size_t>(targetMapToImportRow[Aik]);
2066 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
2067 LO Ikj = Icolind[j];
2068 LO Cij = Icol2Ccol[Ikj];
2069
2070 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2071 // New entry
2072 c_status[Cij] = CSR_ip;
2073 Ccolind[CSR_ip] = Cij;
2074 Cvals[CSR_ip] = Aval * Ivals[j];
2075 CSR_ip++;
2076 } else {
2077 Cvals[c_status[Cij]] += Aval * Ivals[j];
2078 }
2079 }
2080 }
2081 }
2082
2083 // Resize for next pass if needed
2084 if (i + 1 < m && CSR_ip + std::min(n, (Arowptr[i + 2] - Arowptr[i + 1]) * b_max_nnz_per_row) > CSR_alloc) {
2085 CSR_alloc *= 2;
2086 Kokkos::resize(Ccolind, CSR_alloc);
2087 Kokkos::resize(Cvals, CSR_alloc);
2088 }
2089 OLD_ip = CSR_ip;
2090 }
2091
2092 Crowptr[m] = CSR_ip;
2093
2094 // Downward resize
2095 Kokkos::resize(Ccolind, CSR_ip);
2096 Kokkos::resize(Cvals, CSR_ip);
2097
2098 {
2099 Tpetra::Details::ProfilingRegion MM3("TpetraExt: MMM: Newmatrix Final Sort");
2100
2101 // Final sort & set of CRS arrays
2102 if (params.is_null() || params->get("sort entries", true)) {
2103 // Tpetra's serial SpGEMM results in almost sorted matrices. Use shell sort.
2104 Import_Util::sortCrsEntries(Crowptr, Ccolind, Cvals);
2105 }
2106 C.setAllValues(Crowptr, Ccolind, Cvals);
2107 }
2108
2109 Tpetra::Details::ProfilingRegion MM4("TpetraExt: MMM: Newmatrix ESCC");
2110 {
2111 // Final FillComplete
2112 //
2113 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2114 // Import (from domain Map to column Map) construction (which costs
2115 // lots of communication) by taking the previously constructed
2116 // Import object. We should be able to do this without interfering
2117 // with the implementation of the local part of sparse matrix-matrix
2118 // multply above.
2119 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2120 labelList->set("Timer Label", label);
2121 if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
2122 RCP<const Export<LO, GO, NO>> dummyExport;
2123 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
2124 }
2125}
2126/*********************************************************************************************************/
2127// Kernel method for computing the local portion of C = A*B (reuse)
2128template <class Scalar,
2129 class LocalOrdinal,
2130 class GlobalOrdinal,
2131 class Node>
2132void mult_A_B_reuse(
2133 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2134 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2135 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2136 const std::string& label,
2137 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2138 using Teuchos::Array;
2139 using Teuchos::ArrayRCP;
2140 using Teuchos::ArrayView;
2141 using Teuchos::RCP;
2142 using Teuchos::rcp;
2143
2144 // Tpetra typedefs
2145 typedef LocalOrdinal LO;
2146 typedef GlobalOrdinal GO;
2147 typedef Node NO;
2148 typedef Import<LO, GO, NO> import_type;
2149 typedef Map<LO, GO, NO> map_type;
2150
2151 // Kokkos typedefs
2152 typedef typename map_type::local_map_type local_map_type;
2154 typedef typename KCRS::StaticCrsGraphType graph_t;
2155 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2156 typedef typename NO::execution_space execution_space;
2157 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2158 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2159
2160 Tpetra::Details::ProfilingRegion MM("TpetraExt: MMM: Reuse Cmap");
2161 (void)label;
2162
2163 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2164
2165 // Grab all the maps
2166 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2167 RCP<const map_type> Ccolmap = C.getColMap();
2168 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2169 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2170 local_map_type Irowmap_local;
2171 if (!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2172 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2173 local_map_type Icolmap_local;
2174 if (!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2175 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2176
2177 // Build the final importer / column map, hash table lookups for C
2178 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Bview.colMap->getLocalNumElements()), Icol2Ccol;
2179 {
2180 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2181 // So, column map of C may be a strict subset of the column map of B
2182 Kokkos::parallel_for(
2183 range_type(0, Bview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
2184 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2185 });
2186
2187 if (!Bview.importMatrix.is_null()) {
2188 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2189 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2190
2191 Kokkos::resize(Icol2Ccol, Bview.importMatrix->getColMap()->getLocalNumElements());
2192 Kokkos::parallel_for(
2193 range_type(0, Bview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
2194 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2195 });
2196 }
2197 }
2198
2199 // Run through all the hash table lookups once and for all
2200 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
2201 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
2202 Kokkos::parallel_for(
2203 range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
2204 GO aidx = Acolmap_local.getGlobalElement(i);
2205 LO B_LID = Browmap_local.getLocalElement(aidx);
2206 if (B_LID != LO_INVALID) {
2207 targetMapToOrigRow(i) = B_LID;
2208 targetMapToImportRow(i) = LO_INVALID;
2209 } else {
2210 LO I_LID = Irowmap_local.getLocalElement(aidx);
2211 targetMapToOrigRow(i) = LO_INVALID;
2212 targetMapToImportRow(i) = I_LID;
2213 }
2214 });
2215
2216 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2217 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2218 KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview, Bview, targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
2219}
2220
2221/*********************************************************************************************************/
2222template <class Scalar,
2223 class LocalOrdinal,
2224 class GlobalOrdinal,
2225 class Node,
2226 class LocalOrdinalViewType>
2227void KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2228 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2229 const LocalOrdinalViewType& targetMapToOrigRow,
2230 const LocalOrdinalViewType& targetMapToImportRow,
2231 const LocalOrdinalViewType& Bcol2Ccol,
2232 const LocalOrdinalViewType& Icol2Ccol,
2233 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2234 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node>> /* Cimport */,
2235 const std::string& label,
2236 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2237 using Teuchos::RCP;
2238 using Teuchos::rcp;
2239
2240 // Lots and lots of typedefs
2241 typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
2242 typedef typename KCRS::StaticCrsGraphType graph_t;
2243 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2244 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2245 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2246
2247 typedef Scalar SC;
2248 typedef LocalOrdinal LO;
2249 typedef GlobalOrdinal GO;
2250 typedef Node NO;
2251 typedef Map<LO, GO, NO> map_type;
2252 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2253 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2254 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2255
2256 // By default, if A*B results in an entry that is not supported by the graph of C, we throw.
2257 // This option allows to override this behavior and silently ignores such entries.
2258 bool throwOnInsert = true;
2259 if (!params.is_null() && params->isType<bool>("MM Throw For Non-Existent Entries"))
2260 throwOnInsert = params->get<bool>("MM Throw For Non-Existent Entries");
2261
2262 Tpetra::Details::ProfilingRegion MM("TpetraExt: MMM: Reuse SerialCore");
2263 (void)label;
2264
2265 // Sizes
2266 RCP<const map_type> Ccolmap = C.getColMap();
2267 size_t m = Aview.origMatrix->getLocalNumRows();
2268 size_t n = Ccolmap->getLocalNumElements();
2269
2270 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2271 const KCRS Amat = Aview.origMatrix->getLocalMatrixHost();
2272 const KCRS Bmat = Bview.origMatrix->getLocalMatrixHost();
2273 const KCRS Cmat = C.getLocalMatrixHost();
2274
2275 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2276 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2277 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2278 scalar_view_t Cvals = Cmat.values;
2279
2280 c_lno_view_t Irowptr;
2281 lno_nnz_view_t Icolind;
2282 scalar_view_t Ivals;
2283 if (!Bview.importMatrix.is_null()) {
2284 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2285 Irowptr = lclB.graph.row_map;
2286 Icolind = lclB.graph.entries;
2287 Ivals = lclB.values;
2288 }
2289
2290 // Classic csr assembly (low memory edition)
2291 // mfh 27 Sep 2016: The c_status array is an implementation detail
2292 // of the local sparse matrix-matrix multiply routine.
2293
2294 // The status array will contain the index into colind where this entry was last deposited.
2295 // c_status[i] < CSR_ip - not in the row yet
2296 // c_status[i] >= CSR_ip - this is the entry where you can find the data
2297 // We start with this filled with INVALID's indicating that there are no entries yet.
2298 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2299 std::vector<size_t> c_status(n, ST_INVALID);
2300
2301 // For each row of A/C
2302 size_t CSR_ip = 0, OLD_ip = 0;
2303 for (size_t i = 0; i < m; i++) {
2304 // First fill the c_status array w/ locations where we're allowed to
2305 // generate nonzeros for this row
2306 OLD_ip = Crowptr[i];
2307 CSR_ip = Crowptr[i + 1];
2308 for (size_t k = OLD_ip; k < CSR_ip; k++) {
2309 c_status[Ccolind[k]] = k;
2310
2311 // Reset values in the row of C
2312 Cvals[k] = SC_ZERO;
2313 }
2314
2315 for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
2316 LO Aik = Acolind[k];
2317 const SC Aval = Avals[k];
2318 if (Aval == SC_ZERO)
2319 continue;
2320
2321 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2322 // Local matrix
2323 size_t Bk = static_cast<size_t>(targetMapToOrigRow[Aik]);
2324
2325 for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
2326 LO Bkj = Bcolind[j];
2327 LO Cij = Bcol2Ccol[Bkj];
2328
2329 const bool badInsert = c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip;
2330 if (!badInsert)
2331 Cvals[c_status[Cij]] += Aval * Bvals[j];
2332 else if (throwOnInsert)
2333 TEUCHOS_TEST_FOR_EXCEPTION(badInsert,
2334 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
2335 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2336 }
2337
2338 } else {
2339 // Remote matrix
2340 size_t Ik = static_cast<size_t>(targetMapToImportRow[Aik]);
2341 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
2342 LO Ikj = Icolind[j];
2343 LO Cij = Icol2Ccol[Ikj];
2344
2345 const bool badInsert = c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip;
2346 if (!badInsert)
2347 Cvals[c_status[Cij]] += Aval * Ivals[j];
2348 else if (throwOnInsert)
2349 TEUCHOS_TEST_FOR_EXCEPTION(badInsert,
2350 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
2351 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2352 }
2353 }
2354 }
2355 }
2356
2357 {
2358 Tpetra::Details::ProfilingRegion MM3("TpetraExt: MMM: Reuse ESFC");
2359 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2360 }
2361}
2362
2363/*********************************************************************************************************/
2364// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2365template <class Scalar,
2366 class LocalOrdinal,
2367 class GlobalOrdinal,
2368 class Node>
2369void jacobi_A_B_newmatrix(
2370 typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
2371 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
2372 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2373 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2374 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2375 const std::string& label,
2376 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2377 using Teuchos::Array;
2378 using Teuchos::ArrayRCP;
2379 using Teuchos::ArrayView;
2380 using Teuchos::RCP;
2381 using Teuchos::rcp;
2382 // typedef Scalar SC;
2383 typedef LocalOrdinal LO;
2384 typedef GlobalOrdinal GO;
2385 typedef Node NO;
2386
2387 typedef Import<LO, GO, NO> import_type;
2388 typedef Map<LO, GO, NO> map_type;
2389 typedef typename map_type::local_map_type local_map_type;
2390
2391 // All of the Kokkos typedefs
2393 typedef typename KCRS::StaticCrsGraphType graph_t;
2394 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2395 typedef typename NO::execution_space execution_space;
2396 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2397 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2398
2399 Tpetra::Details::ProfilingRegion MM3("TpetraExt: Jacobi: M5 Cmap");
2400
2401 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2402
2403 // Build the final importer / column map, hash table lookups for C
2404 RCP<const import_type> Cimport;
2405 RCP<const map_type> Ccolmap;
2406 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2407 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ? Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2408 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2409 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2410 local_map_type Irowmap_local;
2411 if (!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2412 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2413 local_map_type Icolmap_local;
2414 if (!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2415
2416 // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2417 // indices of B, to local column indices of C. (B and C have the
2418 // same number of columns.) The kernel uses this, instead of
2419 // copying the entire input matrix B and converting its column
2420 // indices to those of C.
2421 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Bview.colMap->getLocalNumElements()), Icol2Ccol;
2422
2423 if (Bview.importMatrix.is_null()) {
2424 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2425 Cimport = Bimport;
2426 Ccolmap = Bview.colMap;
2427 // Bcol2Ccol is trivial
2428 // Bcol2Ccol is trivial
2429
2430 Kokkos::RangePolicy<execution_space, LO> range(0, static_cast<LO>(Bview.colMap->getLocalNumElements()));
2431 Kokkos::parallel_for(
2432 range, KOKKOS_LAMBDA(const size_t i) {
2433 Bcol2Ccol(i) = static_cast<LO>(i);
2434 });
2435 } else {
2436 // mfh 27 Sep 2016: B has "remotes," so we need to build the
2437 // column Map of C, as well as C's Import object (from its domain
2438 // Map to its column Map). C's column Map is the union of the
2439 // column Maps of (the local part of) B, and the "remote" part of
2440 // B. Ditto for the Import. We have optimized this "setUnion"
2441 // operation on Import objects and Maps.
2442
2443 // Choose the right variant of setUnion
2444 if (!Bimport.is_null() && !Iimport.is_null()) {
2445 Cimport = Bimport->setUnion(*Iimport);
2446 Ccolmap = Cimport->getTargetMap();
2447
2448 } else if (!Bimport.is_null() && Iimport.is_null()) {
2449 Cimport = Bimport->setUnion();
2450
2451 } else if (Bimport.is_null() && !Iimport.is_null()) {
2452 Cimport = Iimport->setUnion();
2453
2454 } else
2455 throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2456
2457 Ccolmap = Cimport->getTargetMap();
2458
2459 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2460 std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2461
2462 // NOTE: This is not efficient and should be folded into setUnion
2463 //
2464 // mfh 27 Sep 2016: What the above comment means, is that the
2465 // setUnion operation on Import objects could also compute these
2466 // local index - to - local index look-up tables.
2467 Kokkos::resize(Icol2Ccol, Bview.importMatrix->getColMap()->getLocalNumElements());
2468 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2469 Kokkos::parallel_for(
2470 range_type(0, Bview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
2471 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2472 });
2473 Kokkos::parallel_for(
2474 range_type(0, Bview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
2475 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2476 });
2477 }
2478
2479 // Replace the column map
2480 //
2481 // mfh 27 Sep 2016: We do this because C was originally created
2482 // without a column Map. Now we have its column Map.
2483 C.replaceColMap(Ccolmap);
2484
2485 // mfh 27 Sep 2016: Construct tables that map from local column
2486 // indices of A, to local row indices of either B_local (the locally
2487 // owned part of B), or B_remote (the "imported" remote part of B).
2488 //
2489 // For column index Aik in row i of A, if the corresponding row of B
2490 // exists in the local part of B ("orig") (which I'll call B_local),
2491 // then targetMapToOrigRow[Aik] is the local index of that row of B.
2492 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2493 //
2494 // For column index Aik in row i of A, if the corresponding row of B
2495 // exists in the remote part of B ("Import") (which I'll call
2496 // B_remote), then targetMapToImportRow[Aik] is the local index of
2497 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2498 // (a flag value).
2499
2500 // Run through all the hash table lookups once and for all
2501 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
2502 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
2503 Kokkos::parallel_for(
2504 range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
2505 GO aidx = Acolmap_local.getGlobalElement(i);
2506 LO B_LID = Browmap_local.getLocalElement(aidx);
2507 if (B_LID != LO_INVALID) {
2508 targetMapToOrigRow(i) = B_LID;
2509 targetMapToImportRow(i) = LO_INVALID;
2510 } else {
2511 LO I_LID = Irowmap_local.getLocalElement(aidx);
2512 targetMapToOrigRow(i) = LO_INVALID;
2513 targetMapToImportRow(i) = I_LID;
2514 }
2515 });
2516
2517 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2518 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2519 KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega, Dinv, Aview, Bview, targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
2520}
2521
2522/*********************************************************************************************************/
2523// Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2524// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2525
2526template <class Scalar,
2527 class LocalOrdinal,
2528 class GlobalOrdinal,
2529 class Node,
2530 class LocalOrdinalViewType>
2531void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
2532 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
2533 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2534 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2535 const LocalOrdinalViewType& targetMapToOrigRow,
2536 const LocalOrdinalViewType& targetMapToImportRow,
2537 const LocalOrdinalViewType& Bcol2Ccol,
2538 const LocalOrdinalViewType& Icol2Ccol,
2539 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2540 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node>> Cimport,
2541 const std::string& label,
2542 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2543 Tpetra::Details::ProfilingRegion MM("TpetraExt: Jacobi: Newmatrix SerialCore");
2544
2545 using Teuchos::Array;
2546 using Teuchos::ArrayRCP;
2547 using Teuchos::ArrayView;
2548 using Teuchos::RCP;
2549 using Teuchos::rcp;
2550
2551 // Lots and lots of typedefs
2552 typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
2553 typedef typename KCRS::StaticCrsGraphType graph_t;
2554 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2555 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2556 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2557 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2558
2559 // Jacobi-specific
2560 typedef typename scalar_view_t::memory_space scalar_memory_space;
2561
2562 typedef Scalar SC;
2563 typedef LocalOrdinal LO;
2564 typedef GlobalOrdinal GO;
2565 typedef Node NO;
2566
2567 typedef Map<LO, GO, NO> map_type;
2568 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2569 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2570
2571 // Sizes
2572 RCP<const map_type> Ccolmap = C.getColMap();
2573 size_t m = Aview.origMatrix->getLocalNumRows();
2574 size_t n = Ccolmap->getLocalNumElements();
2575 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2576
2577 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2578 const KCRS Amat = Aview.origMatrix->getLocalMatrixHost();
2579 const KCRS Bmat = Bview.origMatrix->getLocalMatrixHost();
2580
2581 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2582 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2583 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2584
2585 c_lno_view_t Irowptr;
2586 lno_nnz_view_t Icolind;
2587 scalar_view_t Ivals;
2588 if (!Bview.importMatrix.is_null()) {
2589 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2590 Irowptr = lclB.graph.row_map;
2591 Icolind = lclB.graph.entries;
2592 Ivals = lclB.values;
2593 b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
2594 }
2595
2596 // Jacobi-specific inner stuff
2597 auto Dvals =
2598 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2599
2600 // Teuchos::ArrayView::operator[].
2601 // The status array will contain the index into colind where this entry was last deposited.
2602 // c_status[i] < CSR_ip - not in the row yet.
2603 // c_status[i] >= CSR_ip, this is the entry where you can find the data
2604 // We start with this filled with INVALID's indicating that there are no entries yet.
2605 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2606 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2607 Array<size_t> c_status(n, ST_INVALID);
2608
2609 // Classic csr assembly (low memory edition)
2610 //
2611 // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2612 // The method loops over rows of A, and may resize after processing
2613 // each row. Chris Siefert says that this reflects experience in
2614 // ML; for the non-threaded case, ML found it faster to spend less
2615 // effort on estimation and risk an occasional reallocation.
2616 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2617 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"), m + 1);
2618 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"), CSR_alloc);
2619 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"), CSR_alloc);
2620 size_t CSR_ip = 0, OLD_ip = 0;
2621
2622 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2623
2624 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2625 // routine. The routine computes
2626 //
2627 // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2628 //
2629 // This corresponds to one sweep of (weighted) Jacobi.
2630 //
2631 // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2632 // you whether the corresponding row of B belongs to B_local
2633 // ("orig") or B_remote ("Import").
2634
2635 // For each row of A/C
2636 for (size_t i = 0; i < m; i++) {
2637 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2638 // on the calling process.
2639 Crowptr[i] = CSR_ip;
2640 SC minusOmegaDval = -omega * Dvals(i, 0);
2641
2642 // Entries of B
2643 for (size_t j = Browptr[i]; j < Browptr[i + 1]; j++) {
2644 Scalar Bval = Bvals[j];
2645 if (Bval == SC_ZERO)
2646 continue;
2647 LO Bij = Bcolind[j];
2648 LO Cij = Bcol2Ccol[Bij];
2649
2650 // Assume no repeated entries in B
2651 c_status[Cij] = CSR_ip;
2652 Ccolind[CSR_ip] = Cij;
2653 Cvals[CSR_ip] = Bvals[j];
2654 CSR_ip++;
2655 }
2656
2657 // Entries of -omega * Dinv * A * B
2658 for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
2659 LO Aik = Acolind[k];
2660 const SC Aval = Avals[k];
2661 if (Aval == SC_ZERO)
2662 continue;
2663
2664 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2665 // Local matrix
2666 size_t Bk = static_cast<size_t>(targetMapToOrigRow[Aik]);
2667
2668 for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
2669 LO Bkj = Bcolind[j];
2670 LO Cij = Bcol2Ccol[Bkj];
2671
2672 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2673 // New entry
2674 c_status[Cij] = CSR_ip;
2675 Ccolind[CSR_ip] = Cij;
2676 Cvals[CSR_ip] = minusOmegaDval * Aval * Bvals[j];
2677 CSR_ip++;
2678
2679 } else {
2680 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
2681 }
2682 }
2683
2684 } else {
2685 // Remote matrix
2686 size_t Ik = static_cast<size_t>(targetMapToImportRow[Aik]);
2687 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
2688 LO Ikj = Icolind[j];
2689 LO Cij = Icol2Ccol[Ikj];
2690
2691 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2692 // New entry
2693 c_status[Cij] = CSR_ip;
2694 Ccolind[CSR_ip] = Cij;
2695 Cvals[CSR_ip] = minusOmegaDval * Aval * Ivals[j];
2696 CSR_ip++;
2697 } else {
2698 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
2699 }
2700 }
2701 }
2702 }
2703
2704 // Resize for next pass if needed
2705 if (i + 1 < m && CSR_ip + std::min(n, (Arowptr[i + 2] - Arowptr[i + 1] + 1) * b_max_nnz_per_row) > CSR_alloc) {
2706 CSR_alloc *= 2;
2707 Kokkos::resize(Ccolind, CSR_alloc);
2708 Kokkos::resize(Cvals, CSR_alloc);
2709 }
2710 OLD_ip = CSR_ip;
2711 }
2712 Crowptr[m] = CSR_ip;
2713
2714 // Downward resize
2715 Kokkos::resize(Ccolind, CSR_ip);
2716 Kokkos::resize(Cvals, CSR_ip);
2717
2718 {
2719 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: Newmatrix Final Sort");
2720
2721 // Replace the column map
2722 //
2723 // mfh 27 Sep 2016: We do this because C was originally created
2724 // without a column Map. Now we have its column Map.
2725 C.replaceColMap(Ccolmap);
2726
2727 // Final sort & set of CRS arrays
2728 //
2729 // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2730 // matrix-matrix multiply routine sort the entries for us?
2731 // Final sort & set of CRS arrays
2732 if (params.is_null() || params->get("sort entries", true)) {
2733 // Tpetra's serial SpGEMM results in almost sorted matrices. Use shell sort.
2734 Import_Util::sortCrsEntries(Crowptr, Ccolind, Cvals);
2735 }
2736 C.setAllValues(Crowptr, Ccolind, Cvals);
2737 }
2738 {
2739 Tpetra::Details::ProfilingRegion MM3("TpetraExt: Jacobi: Newmatrix ESFC");
2740
2741 // Final FillComplete
2742 //
2743 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2744 // Import (from domain Map to column Map) construction (which costs
2745 // lots of communication) by taking the previously constructed
2746 // Import object. We should be able to do this without interfering
2747 // with the implementation of the local part of sparse matrix-matrix
2748 // multply above
2749 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2750 labelList->set("Timer Label", label);
2751 if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
2752 RCP<const Export<LO, GO, NO>> dummyExport;
2753 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
2754 }
2755}
2756
2757/*********************************************************************************************************/
2758// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2759template <class Scalar,
2760 class LocalOrdinal,
2761 class GlobalOrdinal,
2762 class Node>
2763void jacobi_A_B_reuse(
2764 typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
2765 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
2766 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2767 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2768 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2769 const std::string& label,
2770 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2771 using Teuchos::Array;
2772 using Teuchos::ArrayRCP;
2773 using Teuchos::ArrayView;
2774 using Teuchos::RCP;
2775 using Teuchos::rcp;
2776
2777 typedef LocalOrdinal LO;
2778 typedef GlobalOrdinal GO;
2779 typedef Node NO;
2780
2781 typedef Import<LO, GO, NO> import_type;
2782 typedef Map<LO, GO, NO> map_type;
2783
2784 // Kokkos typedefs
2785 typedef typename map_type::local_map_type local_map_type;
2787 typedef typename KCRS::StaticCrsGraphType graph_t;
2788 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2789 typedef typename NO::execution_space execution_space;
2790 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2791 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2792
2793 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2794 lo_view_t Bcol2Ccol, Icol2Ccol;
2795 lo_view_t targetMapToOrigRow;
2796 lo_view_t targetMapToImportRow;
2797 {
2798 Tpetra::Details::ProfilingRegion MM("TpetraExt: Jacobi: Reuse Cmap");
2799
2800 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2801
2802 // Grab all the maps
2803 RCP<const map_type> Ccolmap = C.getColMap();
2804 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2805 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2806 local_map_type Irowmap_local;
2807 if (!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2808 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2809 local_map_type Icolmap_local;
2810 if (!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2811 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2812
2813 // Build the final importer / column map, hash table lookups for C
2814 Bcol2Ccol = lo_view_t(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Bview.colMap->getLocalNumElements());
2815 {
2816 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2817 // So, column map of C may be a strict subset of the column map of B
2818 Kokkos::parallel_for(
2819 range_type(0, Bview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
2820 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2821 });
2822
2823 if (!Bview.importMatrix.is_null()) {
2824 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2825 std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2826
2827 Kokkos::resize(Icol2Ccol, Bview.importMatrix->getColMap()->getLocalNumElements());
2828 Kokkos::parallel_for(
2829 range_type(0, Bview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
2830 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2831 });
2832 }
2833 }
2834
2835 // Run through all the hash table lookups once and for all
2836 targetMapToOrigRow = lo_view_t(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
2837 targetMapToImportRow = lo_view_t(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
2838 Kokkos::parallel_for(
2839 range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
2840 GO aidx = Acolmap_local.getGlobalElement(i);
2841 LO B_LID = Browmap_local.getLocalElement(aidx);
2842 if (B_LID != LO_INVALID) {
2843 targetMapToOrigRow(i) = B_LID;
2844 targetMapToImportRow(i) = LO_INVALID;
2845 } else {
2846 LO I_LID = Irowmap_local.getLocalElement(aidx);
2847 targetMapToOrigRow(i) = LO_INVALID;
2848 targetMapToImportRow(i) = I_LID;
2849 }
2850 });
2851 }
2852
2853 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2854 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2855 KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega, Dinv, Aview, Bview, targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
2856}
2857
2858/*********************************************************************************************************/
2859template <class Scalar,
2860 class LocalOrdinal,
2861 class GlobalOrdinal,
2862 class Node,
2863 class LocalOrdinalViewType>
2864void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
2865 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
2866 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2867 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2868 const LocalOrdinalViewType& targetMapToOrigRow,
2869 const LocalOrdinalViewType& targetMapToImportRow,
2870 const LocalOrdinalViewType& Bcol2Ccol,
2871 const LocalOrdinalViewType& Icol2Ccol,
2872 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2873 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node>> /* Cimport */,
2874 const std::string& label,
2875 const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2876 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: Reuse Serial Core");
2877 (void)label;
2878
2879 using Teuchos::RCP;
2880 using Teuchos::rcp;
2881
2882 // Lots and lots of typedefs
2883 typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
2884 typedef typename KCRS::StaticCrsGraphType graph_t;
2885 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2886 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2887 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2888 typedef typename scalar_view_t::memory_space scalar_memory_space;
2889
2890 typedef Scalar SC;
2891 typedef LocalOrdinal LO;
2892 typedef GlobalOrdinal GO;
2893 typedef Node NO;
2894 typedef Map<LO, GO, NO> map_type;
2895 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2896 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2897 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2898
2899 // Sizes
2900 RCP<const map_type> Ccolmap = C.getColMap();
2901 size_t m = Aview.origMatrix->getLocalNumRows();
2902 size_t n = Ccolmap->getLocalNumElements();
2903
2904 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2905 const KCRS Amat = Aview.origMatrix->getLocalMatrixHost();
2906 const KCRS Bmat = Bview.origMatrix->getLocalMatrixHost();
2907 const KCRS Cmat = C.getLocalMatrixHost();
2908
2909 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2910 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2911 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2912 scalar_view_t Cvals = Cmat.values;
2913
2914 c_lno_view_t Irowptr;
2915 lno_nnz_view_t Icolind;
2916 scalar_view_t Ivals;
2917 if (!Bview.importMatrix.is_null()) {
2918 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2919 Irowptr = lclB.graph.row_map;
2920 Icolind = lclB.graph.entries;
2921 Ivals = lclB.values;
2922 }
2923
2924 // Jacobi-specific inner stuff
2925 auto Dvals =
2926 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2927
2928 // The status array will contain the index into colind where this entry was last deposited.
2929 // c_status[i] < CSR_ip - not in the row yet
2930 // c_status[i] >= CSR_ip - this is the entry where you can find the data
2931 // We start with this filled with INVALID's indicating that there are no entries yet.
2932 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2933 std::vector<size_t> c_status(n, ST_INVALID);
2934
2935 // For each row of A/C
2936 size_t CSR_ip = 0, OLD_ip = 0;
2937 for (size_t i = 0; i < m; i++) {
2938 // First fill the c_status array w/ locations where we're allowed to
2939 // generate nonzeros for this row
2940 OLD_ip = Crowptr[i];
2941 CSR_ip = Crowptr[i + 1];
2942 for (size_t k = OLD_ip; k < CSR_ip; k++) {
2943 c_status[Ccolind[k]] = k;
2944
2945 // Reset values in the row of C
2946 Cvals[k] = SC_ZERO;
2947 }
2948
2949 SC minusOmegaDval = -omega * Dvals(i, 0);
2950
2951 // Entries of B
2952 for (size_t j = Browptr[i]; j < Browptr[i + 1]; j++) {
2953 Scalar Bval = Bvals[j];
2954 if (Bval == SC_ZERO)
2955 continue;
2956 LO Bij = Bcolind[j];
2957 LO Cij = Bcol2Ccol[Bij];
2958
2959 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2960 std::runtime_error, "Trying to insert a new entry into a static graph");
2961
2962 Cvals[c_status[Cij]] = Bvals[j];
2963 }
2964
2965 // Entries of -omega * Dinv * A * B
2966 for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
2967 LO Aik = Acolind[k];
2968 const SC Aval = Avals[k];
2969 if (Aval == SC_ZERO)
2970 continue;
2971
2972 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2973 // Local matrix
2974 size_t Bk = static_cast<size_t>(targetMapToOrigRow[Aik]);
2975
2976 for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
2977 LO Bkj = Bcolind[j];
2978 LO Cij = Bcol2Ccol[Bkj];
2979
2980 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2981 std::runtime_error, "Trying to insert a new entry into a static graph");
2982
2983 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
2984 }
2985
2986 } else {
2987 // Remote matrix
2988 size_t Ik = static_cast<size_t>(targetMapToImportRow[Aik]);
2989 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
2990 LO Ikj = Icolind[j];
2991 LO Cij = Icol2Ccol[Ikj];
2992
2993 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2994 std::runtime_error, "Trying to insert a new entry into a static graph");
2995
2996 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
2997 }
2998 }
2999 }
3000 }
3001
3002 {
3003 Tpetra::Details::ProfilingRegion MM3("TpetraExt: Jacobi: Reuse ESFC");
3004 C.fillComplete(C.getDomainMap(), C.getRangeMap());
3005 }
3006}
3007
3008/*********************************************************************************************************/
3009template <class Scalar,
3010 class LocalOrdinal,
3011 class GlobalOrdinal,
3012 class Node>
3013void import_and_extract_views(
3014 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3015 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>> targetMap,
3016 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3017 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node>> prototypeImporter,
3018 bool userAssertsThereAreNoRemotes,
3019 const std::string& label,
3020 const Teuchos::RCP<Teuchos::ParameterList>& params) {
3021 using Teuchos::Array;
3022 using Teuchos::ArrayView;
3023 using Teuchos::null;
3024 using Teuchos::RCP;
3025 using Teuchos::rcp;
3026
3027 typedef Scalar SC;
3028 typedef LocalOrdinal LO;
3029 typedef GlobalOrdinal GO;
3030 typedef Node NO;
3031
3032 typedef Map<LO, GO, NO> map_type;
3033 typedef Import<LO, GO, NO> import_type;
3034 typedef CrsMatrix<SC, LO, GO, NO> crs_matrix_type;
3035
3036 RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Alloc"));
3037
3038 // The goal of this method is to populate the 'Aview' struct with views of the
3039 // rows of A, including all rows that correspond to elements in 'targetMap'.
3040 //
3041 // If targetMap includes local elements that correspond to remotely-owned rows
3042 // of A, then those remotely-owned rows will be imported into
3043 // 'Aview.importMatrix', and views of them will be included in 'Aview'.
3044 Aview.deleteContents();
3045
3046 Aview.origMatrix = rcp(&A, false);
3047 // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3048 Aview.origMatrix->getApplyHelper();
3049 Aview.origRowMap = A.getRowMap();
3050 Aview.rowMap = targetMap;
3051 Aview.colMap = A.getColMap();
3052 Aview.domainMap = A.getDomainMap();
3053 Aview.importColMap = null;
3054 RCP<const map_type> rowMap = A.getRowMap();
3055 const int numProcs = rowMap->getComm()->getSize();
3056
3057 // Short circuit if the user swears there are no remotes (or if we're in serial)
3058 if (userAssertsThereAreNoRemotes || numProcs < 2)
3059 return;
3060
3061 RCP<const import_type> importer;
3062 if (params != null && params->isParameter("importer")) {
3063 importer = params->get<RCP<const import_type>>("importer");
3064
3065 } else {
3066 MM = Teuchos::null;
3067 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X RemoteMap"));
3068
3069 // Mark each row in targetMap as local or remote, and go ahead and get a view
3070 // for the local rows
3071 RCP<const map_type> remoteRowMap;
3072 size_t numRemote = 0;
3073 int mode = 0;
3074 if (!prototypeImporter.is_null() &&
3075 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3076 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3077 // We have a valid prototype importer --- ask it for the remotes
3078 Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: I&X RemoteMap: Mode1");
3079
3080 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3081 numRemote = prototypeImporter->getNumRemoteIDs();
3082
3083 Array<GO> remoteRows(numRemote);
3084 for (size_t i = 0; i < numRemote; i++)
3085 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3086
3087 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3088 rowMap->getIndexBase(), rowMap->getComm(), params));
3089 mode = 1;
3090
3091 } else if (prototypeImporter.is_null()) {
3092 // No prototype importer --- count the remotes the hard way
3093 Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: I&X RemoteMap: Mode2");
3094
3095 ArrayView<const GO> rows = targetMap->getLocalElementList();
3096 size_t numRows = targetMap->getLocalNumElements();
3097
3098 Array<GO> remoteRows(numRows);
3099 for (size_t i = 0; i < numRows; ++i) {
3100 const LO mlid = rowMap->getLocalElement(rows[i]);
3101
3102 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3103 remoteRows[numRemote++] = rows[i];
3104 }
3105 remoteRows.resize(numRemote);
3106 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3107 rowMap->getIndexBase(), rowMap->getComm(), params));
3108 mode = 2;
3109
3110 } else {
3111 // PrototypeImporter is bad. But if we're in serial that's OK.
3112 mode = 3;
3113 }
3114
3115 if (numProcs < 2) {
3116 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3117 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3118 // If only one processor we don't need to import any remote rows, so return.
3119 return;
3120 }
3121
3122 //
3123 // Now we will import the needed remote rows of A, if the remoteRowMap has entries
3124 //
3125 if (!remoteRowMap.is_null() && (remoteRowMap->getGlobalNumElements() > 0)) {
3126 MM = Teuchos::null;
3127 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Import-2"));
3128
3129 // Create an importer with target-map remoteRowMap and source-map rowMap.
3130 if (mode == 1)
3131 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3132 else if (mode == 2)
3133 importer = rcp(new import_type(rowMap, remoteRowMap));
3134 else
3135 throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
3136 }
3137
3138 if (params != null)
3139 params->set("importer", importer);
3140 }
3141
3142 if (importer != null) {
3143 MM = Teuchos::null;
3144 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Import-3"));
3145
3146 // Now create a new matrix into which we can import the remote rows of A that we need.
3147 Teuchos::ParameterList labelList;
3148 labelList.set("Timer Label", label);
3149 auto& labelList_subList = labelList.sublist("matrixmatrix: kernel params", false);
3150
3151 bool isMM = true;
3152 bool overrideAllreduce = false;
3153 int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3154 // Minor speedup tweak - avoid computing the global constants
3155 Teuchos::ParameterList params_sublist;
3156 if (!params.is_null()) {
3157 labelList.set("compute global constants", params->get("compute global constants", false));
3158 params_sublist = params->sublist("matrixmatrix: kernel params", false);
3159 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
3160 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
3161 if (mm_optimization_core_count2 < mm_optimization_core_count) mm_optimization_core_count = mm_optimization_core_count2;
3162 isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete", false);
3163 overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck", false);
3164 }
3165 labelList_subList.set("isMatrixMatrix_TransferAndFillComplete", isMM);
3166 labelList_subList.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
3167 labelList_subList.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
3168
3169 Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3170 A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3171 // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3172 Aview.importMatrix->getApplyHelper();
3173
3174#if 0
3175 // Disabled code for dumping input matrices
3176 static int count=0;
3177 char str[80];
3178 sprintf(str,"import_matrix.%d.dat",count);
3180 count++;
3181#endif
3182
3183#ifdef HAVE_TPETRA_MMM_STATISTICS
3184 printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3185#endif
3186
3187 MM = Teuchos::null;
3188 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Import-4"));
3189
3190 // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3191 Aview.importColMap = Aview.importMatrix->getColMap();
3192 MM = Teuchos::null;
3193 }
3194}
3195
3196/*********************************************************************************************************/
3197template <class Scalar,
3198 class LocalOrdinal,
3199 class GlobalOrdinal,
3200 class Node>
3201void import_and_extract_views(
3202 const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& M,
3203 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>> targetMap,
3204 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview,
3205 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node>> prototypeImporter,
3206 bool userAssertsThereAreNoRemotes) {
3207 using Teuchos::Array;
3208 using Teuchos::ArrayView;
3209 using Teuchos::null;
3210 using Teuchos::RCP;
3211 using Teuchos::rcp;
3212
3213 typedef Scalar SC;
3214 typedef LocalOrdinal LO;
3215 typedef GlobalOrdinal GO;
3216 typedef Node NO;
3217
3218 typedef Map<LO, GO, NO> map_type;
3219 typedef Import<LO, GO, NO> import_type;
3220 typedef BlockCrsMatrix<SC, LO, GO, NO> blockcrs_matrix_type;
3221
3222 // The goal of this method is to populate the 'Mview' struct with views of the
3223 // rows of M, including all rows that correspond to elements in 'targetMap'.
3224 //
3225 // If targetMap includes local elements that correspond to remotely-owned rows
3226 // of M, then those remotely-owned rows will be imported into
3227 // 'Mview.importMatrix', and views of them will be included in 'Mview'.
3228 Mview.deleteContents();
3229
3230 Mview.origMatrix = rcp(&M, false);
3231 // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3232 Mview.origMatrix->getApplyHelper();
3233 Mview.origRowMap = M.getRowMap();
3234 Mview.rowMap = targetMap;
3235 Mview.colMap = M.getColMap();
3236 Mview.importColMap = null;
3237 RCP<const map_type> rowMap = M.getRowMap();
3238 const int numProcs = rowMap->getComm()->getSize();
3239
3240 // Short circuit if the user swears there are no remotes (or if we're in serial)
3241 if (userAssertsThereAreNoRemotes || numProcs < 2) return;
3242
3243 // Mark each row in targetMap as local or remote, and go ahead and get a view
3244 // for the local rows
3245 RCP<const map_type> remoteRowMap;
3246 size_t numRemote = 0;
3247 int mode = 0;
3248 if (!prototypeImporter.is_null() &&
3249 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3250 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3251 // We have a valid prototype importer --- ask it for the remotes
3252 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3253 numRemote = prototypeImporter->getNumRemoteIDs();
3254
3255 Array<GO> remoteRows(numRemote);
3256 for (size_t i = 0; i < numRemote; i++)
3257 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3258
3259 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3260 rowMap->getIndexBase(), rowMap->getComm()));
3261 mode = 1;
3262
3263 } else if (prototypeImporter.is_null()) {
3264 // No prototype importer --- count the remotes the hard way
3265 ArrayView<const GO> rows = targetMap->getLocalElementList();
3266 size_t numRows = targetMap->getLocalNumElements();
3267
3268 Array<GO> remoteRows(numRows);
3269 for (size_t i = 0; i < numRows; ++i) {
3270 const LO mlid = rowMap->getLocalElement(rows[i]);
3271
3272 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3273 remoteRows[numRemote++] = rows[i];
3274 }
3275 remoteRows.resize(numRemote);
3276 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3277 rowMap->getIndexBase(), rowMap->getComm()));
3278 mode = 2;
3279
3280 } else {
3281 // PrototypeImporter is bad. But if we're in serial that's OK.
3282 mode = 3;
3283 }
3284
3285 if (numProcs < 2) {
3286 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3287 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3288 // If only one processor we don't need to import any remote rows, so return.
3289 return;
3290 }
3291
3292 // Now we will import the needed remote rows of M, if the remoteRowMap has entries
3293
3294 RCP<const import_type> importer;
3295
3296 if (!remoteRowMap.is_null() && (remoteRowMap->getGlobalNumElements() > 0)) {
3297 // Create an importer with target-map remoteRowMap and source-map rowMap.
3298 if (mode == 1)
3299 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3300 else if (mode == 2)
3301 importer = rcp(new import_type(rowMap, remoteRowMap));
3302 else
3303 throw std::runtime_error("prototypeImporter->SourceMap() does not match M.getRowMap()!");
3304 }
3305
3306 if (importer != null) {
3307 // Get import matrix
3308 // TODO: create the int-typed row-pointer here
3309 Mview.importMatrix = Tpetra::importAndFillCompleteBlockCrsMatrix<blockcrs_matrix_type>(rcpFromRef(M), *importer);
3310 // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3311 Mview.importMatrix->getApplyHelper();
3312 // Save the column map of the imported matrix, so that we can convert indices
3313 // back to global for arithmetic later
3314 Mview.importColMap = Mview.importMatrix->getColMap();
3315 }
3316}
3317
3318/*********************************************************************************************************/
3319// This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3320template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
3322merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3323 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3324 const LocalOrdinalViewType& Acol2Brow,
3325 const LocalOrdinalViewType& Acol2Irow,
3326 const LocalOrdinalViewType& Bcol2Ccol,
3327 const LocalOrdinalViewType& Icol2Ccol,
3328 const size_t mergedNodeNumCols) {
3329 using Teuchos::RCP;
3331 typedef typename KCRS::StaticCrsGraphType graph_t;
3332 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3333 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3334 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3335 // Grab the Kokkos::SparseCrsMatrices
3336 const KCRS Ak = Aview.origMatrix->getLocalMatrixDevice();
3337 const KCRS Bk = Bview.origMatrix->getLocalMatrixDevice();
3338
3339 // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
3340 if (!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3341 // We do have a Bimport
3342 // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3343 // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3344
3345 KCRS Iks;
3346 if (!Bview.importMatrix.is_null()) Iks = Bview.importMatrix->getLocalMatrixDevice();
3347
3348 size_t merge_numrows = Ak.numCols();
3349
3350 // The last entry of this at least, need to be initialized
3351 lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3352
3353 const LocalOrdinal LO_INVALID = Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3354
3355 // Use a Kokkos::parallel_scan to build the rowptr
3356 typedef typename Node::execution_space execution_space;
3357 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3358 Kokkos::parallel_scan(
3359 "Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type(0, merge_numrows),
3360 KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3361 if (final) Mrowptr(i) = update;
3362 // Get the row count
3363 size_t ct = 0;
3364 if (Acol2Brow(i) != LO_INVALID)
3365 ct = Bk.graph.row_map(Acol2Brow(i) + 1) - Bk.graph.row_map(Acol2Brow(i));
3366 else
3367 ct = Iks.graph.row_map(Acol2Irow(i) + 1) - Iks.graph.row_map(Acol2Irow(i));
3368 update += ct;
3369
3370 if (final && i + 1 == merge_numrows)
3371 Mrowptr(i + 1) = update;
3372 });
3373
3374 // Allocate nnz
3375 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr, merge_numrows);
3376 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"), merge_nnz);
3377 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"), merge_nnz);
3378
3379 // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3380 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3381 Kokkos::parallel_for(
3382 "Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type(0, merge_numrows), KOKKOS_LAMBDA(const size_t i) {
3383 if (Acol2Brow(i) != LO_INVALID) {
3384 size_t row = Acol2Brow(i);
3385 size_t start = Bk.graph.row_map(row);
3386 for (size_t j = Mrowptr(i); j < Mrowptr(i + 1); j++) {
3387 Mvalues(j) = Bk.values(j - Mrowptr(i) + start);
3388 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j - Mrowptr(i) + start));
3389 }
3390 } else {
3391 size_t row = Acol2Irow(i);
3392 size_t start = Iks.graph.row_map(row);
3393 for (size_t j = Mrowptr(i); j < Mrowptr(i + 1); j++) {
3394 Mvalues(j) = Iks.values(j - Mrowptr(i) + start);
3395 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j - Mrowptr(i) + start));
3396 }
3397 }
3398 });
3399
3400 KCRS newmat("CrsMatrix", merge_numrows, mergedNodeNumCols, merge_nnz, Mvalues, Mrowptr, Mcolind);
3401 return newmat;
3402 } else {
3403 // We don't have a Bimport (the easy case)
3404 return Bk;
3405 }
3406} // end merge_matrices
3407
3408/*********************************************************************************************************/
3409// This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3410template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
3411const typename Tpetra::BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
3412merge_matrices(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3413 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3414 const LocalOrdinalViewType& Acol2Brow,
3415 const LocalOrdinalViewType& Acol2Irow,
3416 const LocalOrdinalViewType& Bcol2Ccol,
3417 const LocalOrdinalViewType& Icol2Ccol,
3418 const size_t mergedNodeNumCols) {
3419 using Teuchos::RCP;
3420 typedef typename Tpetra::BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KBCRS;
3421 typedef typename KBCRS::StaticCrsGraphType graph_t;
3422 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3423 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3424 typedef typename KBCRS::values_type::non_const_type scalar_view_t;
3425
3426 // Grab the KokkosSparse::BsrMatrix
3427 const KBCRS Ak = Aview.origMatrix->getLocalMatrixDevice();
3428 const KBCRS Bk = Bview.origMatrix->getLocalMatrixDevice();
3429
3430 // We need to do this dance if either (a) We have Bimport or (b) A's colMap is not the same as B's rowMap
3431 if (!Bview.importMatrix.is_null() ||
3432 (Bview.importMatrix.is_null() &&
3433 (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3434 // We do have a Bimport
3435 // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3436 // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3437 KBCRS Iks;
3438 if (!Bview.importMatrix.is_null()) Iks = Bview.importMatrix->getLocalMatrixDevice();
3439 size_t merge_numrows = Ak.numCols();
3440
3441 // The last entry of this at least, need to be initialized
3442 lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3443
3444 const LocalOrdinal LO_INVALID = Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3445
3446 // Use a Kokkos::parallel_scan to build the rowptr
3447 typedef typename Node::execution_space execution_space;
3448 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3449 Kokkos::parallel_scan(
3450 "Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type(0, merge_numrows),
3451 KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3452 if (final) Mrowptr(i) = update;
3453 // Get the row count
3454 size_t ct = 0;
3455 if (Acol2Brow(i) != LO_INVALID)
3456 ct = Bk.graph.row_map(Acol2Brow(i) + 1) - Bk.graph.row_map(Acol2Brow(i));
3457 else
3458 ct = Iks.graph.row_map(Acol2Irow(i) + 1) - Iks.graph.row_map(Acol2Irow(i));
3459 update += ct;
3460
3461 if (final && i + 1 == merge_numrows)
3462 Mrowptr(i + 1) = update;
3463 });
3464
3465 // Allocate nnz
3466 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr, merge_numrows);
3467 const int blocksize = Ak.blockDim();
3468 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"), merge_nnz);
3469 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"), merge_nnz * blocksize * blocksize);
3470
3471 // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3472 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3473 Kokkos::parallel_for(
3474 "Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type(0, merge_numrows), KOKKOS_LAMBDA(const size_t i) {
3475 if (Acol2Brow(i) != LO_INVALID) {
3476 size_t row = Acol2Brow(i);
3477 size_t start = Bk.graph.row_map(row);
3478 for (size_t j = Mrowptr(i); j < Mrowptr(i + 1); j++) {
3479 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j - Mrowptr(i) + start));
3480
3481 for (int b = 0; b < blocksize * blocksize; ++b) {
3482 const int val_indx = j * blocksize * blocksize + b;
3483 const int b_val_indx = (j - Mrowptr(i) + start) * blocksize * blocksize + b;
3484 Mvalues(val_indx) = Bk.values(b_val_indx);
3485 }
3486 }
3487 } else {
3488 size_t row = Acol2Irow(i);
3489 size_t start = Iks.graph.row_map(row);
3490 for (size_t j = Mrowptr(i); j < Mrowptr(i + 1); j++) {
3491 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j - Mrowptr(i) + start));
3492
3493 for (int b = 0; b < blocksize * blocksize; ++b) {
3494 const int val_indx = j * blocksize * blocksize + b;
3495 const int b_val_indx = (j - Mrowptr(i) + start) * blocksize * blocksize + b;
3496 Mvalues(val_indx) = Iks.values(b_val_indx);
3497 }
3498 }
3499 }
3500 });
3501
3502 // Build and return merged KokkosSparse matrix
3503 KBCRS newmat("CrsMatrix", merge_numrows, mergedNodeNumCols, merge_nnz, Mvalues, Mrowptr, Mcolind, blocksize);
3504 return newmat;
3505 } else {
3506 // We don't have a Bimport (the easy case)
3507 return Bk;
3508 }
3509} // end merge_matrices
3510
3511/*********************************************************************************************************/
3512template <typename SC, typename LO, typename GO, typename NO>
3513void AddKernels<SC, LO, GO, NO>::
3514 addSorted(
3515 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3516 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3517 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3518 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3519 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3520 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3521 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3522 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3523 GO numGlobalCols,
3524 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3525 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3526 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds) {
3527 using Teuchos::rcp;
3528 using Teuchos::TimeMonitor;
3529 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3530 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3531 auto nrows = Arowptrs.extent(0) - 1;
3532 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3533 typename AddKern::KKH handle;
3534 handle.create_spadd_handle(true);
3535 auto addHandle = handle.get_spadd_handle();
3536
3537 auto MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: sorted symbolic"));
3538
3539 KokkosSparse::spadd_symbolic(&handle,
3540 nrows, numGlobalCols,
3541 Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3542 // KokkosKernels requires values to be zeroed
3543 Cvals = values_array("C values", addHandle->get_c_nnz());
3544 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3545
3546 MM = Teuchos::null;
3547 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: sorted numeric"));
3548 KokkosSparse::spadd_numeric(&handle,
3549 nrows, numGlobalCols,
3550 Arowptrs, Acolinds, Avals, scalarA,
3551 Browptrs, Bcolinds, Bvals, scalarB,
3552 Crowptrs, Ccolinds, Cvals);
3553}
3554
3555template <typename SC, typename LO, typename GO, typename NO>
3556void AddKernels<SC, LO, GO, NO>::
3557 addUnsorted(
3558 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3559 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3560 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3561 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3562 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3563 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3564 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3565 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3566 GO numGlobalCols,
3567 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3568 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3569 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds) {
3570 using Teuchos::rcp;
3571 using Teuchos::TimeMonitor;
3572 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3573 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3574 auto nrows = Arowptrs.extent(0) - 1;
3575 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3576 typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3577 typename AddKern::KKH handle;
3578 handle.create_spadd_handle(false);
3579 auto addHandle = handle.get_spadd_handle();
3580 auto MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: unsorted symbolic"));
3581
3582 KokkosSparse::spadd_symbolic(&handle,
3583 nrows, numGlobalCols,
3584 Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3585 // Cvals must be zeroed out
3586 Cvals = values_array("C values", addHandle->get_c_nnz());
3587 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3588 MM = Teuchos::null;
3589 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: unsorted numeric"));
3590 KokkosSparse::spadd_numeric(&handle,
3591 nrows, numGlobalCols,
3592 Arowptrs, Acolinds, Avals, scalarA,
3593 Browptrs, Bcolinds, Bvals, scalarB,
3594 Crowptrs, Ccolinds, Cvals);
3595}
3596
3597template <typename GO,
3598 typename LocalIndicesType,
3599 typename GlobalIndicesType,
3600 typename ColMapType>
3601struct ConvertLocalToGlobalFunctor {
3602 ConvertLocalToGlobalFunctor(
3603 const LocalIndicesType& colindsOrig_,
3604 const GlobalIndicesType& colindsConverted_,
3605 const ColMapType& colmap_)
3606 : colindsOrig(colindsOrig_)
3607 , colindsConverted(colindsConverted_)
3608 , colmap(colmap_) {}
3609 KOKKOS_INLINE_FUNCTION void
3610 operator()(const GO i) const {
3611 colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3612 }
3613 LocalIndicesType colindsOrig;
3614 GlobalIndicesType colindsConverted;
3615 ColMapType colmap;
3616};
3617
3618template <typename SC, typename LO, typename GO, typename NO>
3619void MMdetails::AddKernels<SC, LO, GO, NO>::
3620 convertToGlobalAndAdd(
3621 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS A,
3622 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3623 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS B,
3624 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3625 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3626 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3627 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3628 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3629 typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds) {
3630 using Teuchos::rcp;
3631 using Teuchos::TimeMonitor;
3632 // Need to use a different KokkosKernelsHandle type than other versions,
3633 // since the ordinals are now GO
3634 using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3635 typename NO::execution_space, typename NO::memory_space, typename NO::memory_space>;
3636
3637 const values_array Avals = A.values;
3638 const values_array Bvals = B.values;
3639 const col_inds_array Acolinds = A.graph.entries;
3640 const col_inds_array Bcolinds = B.graph.entries;
3641 auto Arowptrs = A.graph.row_map;
3642 auto Browptrs = B.graph.row_map;
3643 global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
3644 global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
3645
3646 auto MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: diff col map kernel: column map conversion"));
3647
3648 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3649 Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3650 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3651 Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3652 KKH_GO handle;
3653 handle.create_spadd_handle(false);
3654 auto addHandle = handle.get_spadd_handle();
3655 MM = Teuchos::null;
3656 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: diff col map kernel: unsorted symbolic"));
3657 auto nrows = Arowptrs.extent(0) - 1;
3658 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3659 KokkosSparse::spadd_symbolic(&handle,
3660 nrows, A.numCols(),
3661 Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3662 Cvals = values_array("C values", addHandle->get_c_nnz());
3663 Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3664
3665 MM = Teuchos::null;
3666 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: diff col map kernel: unsorted numeric"));
3667 KokkosSparse::spadd_numeric(&handle,
3668 nrows, A.numCols(),
3669 Arowptrs, AcolindsConverted, Avals, scalarA,
3670 Browptrs, BcolindsConverted, Bvals, scalarB,
3671 Crowptrs, Ccolinds, Cvals);
3672}
3673
3674} // namespace MMdetails
3675
3676} // End namespace Tpetra
3677
3678/*********************************************************************************************************/
3679//
3680// Explicit instantiation macro
3681//
3682// Must be expanded from within the Tpetra namespace!
3683//
3684namespace Tpetra {
3685
3686#define TPETRA_MATRIXMATRIX_INSTANT(SCALAR, LO, GO, NODE) \
3687 template void MatrixMatrix::Multiply( \
3688 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3689 bool transposeA, \
3690 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3691 bool transposeB, \
3692 CrsMatrix<SCALAR, LO, GO, NODE>& C, \
3693 bool call_FillComplete_on_result, \
3694 const std::string& label, \
3695 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3696 \
3697 template void MatrixMatrix::Multiply( \
3698 const Teuchos::RCP<const BlockCrsMatrix<SCALAR, LO, GO, NODE>>& A, \
3699 bool transposeA, \
3700 const Teuchos::RCP<const BlockCrsMatrix<SCALAR, LO, GO, NODE>>& B, \
3701 bool transposeB, \
3702 Teuchos::RCP<BlockCrsMatrix<SCALAR, LO, GO, NODE>>& C, \
3703 const std::string& label); \
3704 \
3705 template void MatrixMatrix::Jacobi( \
3706 typename Teuchos::ScalarTraits<SCALAR>::magnitudeType omega, \
3707 const Vector<SCALAR, LO, GO, NODE>& Dinv, \
3708 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3709 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3710 CrsMatrix<SCALAR, LO, GO, NODE>& C, \
3711 bool call_FillComplete_on_result, \
3712 const std::string& label, \
3713 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3714 \
3715 template void MatrixMatrix::Add( \
3716 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3717 bool transposeA, \
3718 SCALAR scalarA, \
3719 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3720 bool transposeB, \
3721 SCALAR scalarB, \
3722 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>& C); \
3723 \
3724 template void MatrixMatrix::Add( \
3725 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3726 bool transposeA, \
3727 SCALAR scalarA, \
3728 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3729 bool transposeB, \
3730 SCALAR scalarB, \
3731 const Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>& C); \
3732 \
3733 template void MatrixMatrix::Add( \
3734 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3735 bool transposeA, \
3736 SCALAR scalarA, \
3737 CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3738 SCALAR scalarB); \
3739 \
3740 template Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>> \
3741 MatrixMatrix::add<SCALAR, LO, GO, NODE>(const SCALAR& alpha, \
3742 const bool transposeA, \
3743 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3744 const SCALAR& beta, \
3745 const bool transposeB, \
3746 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3747 const Teuchos::RCP<const Map<LO, GO, NODE>>& domainMap, \
3748 const Teuchos::RCP<const Map<LO, GO, NODE>>& rangeMap, \
3749 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3750 \
3751 template void \
3752 MatrixMatrix::add<SCALAR, LO, GO, NODE>(const SCALAR& alpha, \
3753 const bool transposeA, \
3754 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3755 const SCALAR& beta, \
3756 const bool transposeB, \
3757 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3758 CrsMatrix<SCALAR, LO, GO, NODE>& C, \
3759 const Teuchos::RCP<const Map<LO, GO, NODE>>& domainMap, \
3760 const Teuchos::RCP<const Map<LO, GO, NODE>>& rangeMap, \
3761 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3762 \
3763 template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>; \
3764 \
3765 template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const CrsMatrix<SCALAR, LO, GO, NODE>& M, \
3766 Teuchos::RCP<const Map<LO, GO, NODE>> targetMap, \
3767 CrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3768 Teuchos::RCP<const Import<LO, GO, NODE>> prototypeImporter, \
3769 bool userAssertsThereAreNoRemotes, \
3770 const std::string& label, \
3771 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3772 \
3773 template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const BlockCrsMatrix<SCALAR, LO, GO, NODE>& M, \
3774 Teuchos::RCP<const Map<LO, GO, NODE>> targetMap, \
3775 BlockCrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3776 Teuchos::RCP<const Import<LO, GO, NODE>> prototypeImporter, \
3777 bool userAssertsThereAreNoRemotes);
3778} // End namespace Tpetra
3779
3780#endif // TPETRA_MATRIXMATRIX_DEF_HPP
Matrix Market file readers and writers for Tpetra objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declaration and definition of Tpetra::Details::getEntryOnHost.
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
Stand-alone utility functions and macros.
Forward declaration of some Tpetra Matrix Matrix objects.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Struct that holds views of the contents of a CrsMatrix.
Teuchos::RCP< const map_type > colMap
Col map for the original version of the matrix.
Teuchos::RCP< const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > origMatrix
The original matrix.
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
void start()
Start the deep_copy counter.
void Jacobi(typename Teuchos::ScalarTraits< Scalar >::magnitudeType omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void removeCrsMatrixZeros(CrsMatrixType &matrix, typename Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitudeType const &threshold=Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitude(Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::zero()))
Remove zero entries from a matrix.