10#ifndef TPETRA_DETAILS_RESIDUAL_HPP
11#define TPETRA_DETAILS_RESIDUAL_HPP
13#include "TpetraCore_config.h"
14#include "Tpetra_CrsMatrix.hpp"
15#include "Tpetra_LocalCrsMatrixOperator.hpp"
16#include "Tpetra_MultiVector.hpp"
17#include "Teuchos_RCP.hpp"
18#include "Teuchos_ScalarTraits.hpp"
21#include "KokkosSparse_spmv_impl.hpp"
38template <
class AMatrix,
class MV,
class ConstMV,
class Offsets,
bool is_MV,
bool restrictedMode,
bool skipOffRank>
40 using execution_space =
typename AMatrix::execution_space;
41 using LO =
typename AMatrix::non_const_ordinal_type;
42 using value_type =
typename AMatrix::non_const_value_type;
43 using team_policy =
typename Kokkos::TeamPolicy<execution_space>;
44 using team_member =
typename team_policy::member_type;
45#if KOKKOS_VERSION >= 40799
46 using ATV = KokkosKernels::ArithTraits<value_type>;
48 using ATV = Kokkos::ArithTraits<value_type>;
75 void operator()(
const team_member&
dev)
const {
76 Kokkos::parallel_for(Kokkos::TeamThreadRange(
dev, 0, rows_per_team), [&](
const LO&
loop) {
77 const LO
lclRow =
static_cast<LO
>(
dev.league_rank()) * rows_per_team +
loop;
79 if (
lclRow >= A_lcl.numRows()) {
85 value_type
A_x = ATV::zero();
91 Kokkos::parallel_reduce(
100 const size_t start = A_lcl.graph.row_map(
lclRow);
101 const size_t end = A_lcl.graph.row_map(
lclRow + 1);
103 Kokkos::parallel_reduce(
104 Kokkos::ThreadVectorRange(
dev, start,
end), [&](
const LO
iEntry, value_type&
lsum) {
115 Kokkos::single(Kokkos::PerThread(
dev), [&]() {
120 const LO
numVectors =
static_cast<LO
>(X_colmap_lcl.extent(1));
123 value_type
A_x = ATV::zero();
129 Kokkos::parallel_reduce(
137 const size_t start = A_lcl.graph.row_map(
lclRow);
138 const size_t end = A_lcl.graph.row_map(
lclRow + 1);
140 Kokkos::parallel_reduce(
141 Kokkos::ThreadVectorRange(
dev, start,
end), [&](
const LO
iEntry, value_type&
lsum) {
152 Kokkos::single(Kokkos::PerThread(
dev), [&]() {
163template <
class AMatrix,
class MV,
class ConstMV,
class Offsets,
bool is_MV>
165 using execution_space =
typename AMatrix::execution_space;
166 using LO =
typename AMatrix::non_const_ordinal_type;
167 using value_type =
typename AMatrix::non_const_value_type;
168 using team_policy =
typename Kokkos::TeamPolicy<execution_space>;
169 using team_member =
typename team_policy::member_type;
170#if KOKKOS_VERSION >= 40799
171 using ATV = KokkosKernels::ArithTraits<value_type>;
173 using ATV = Kokkos::ArithTraits<value_type>;
191 , offsets(offsets_) {}
194 void operator()(
const team_member&
dev)
const {
195 Kokkos::parallel_for(Kokkos::TeamThreadRange(
dev, 0, rows_per_team), [&](
const LO&
loop) {
196 const LO
lclRow =
static_cast<LO
>(
dev.league_rank()) * rows_per_team +
loop;
198 if (
lclRow >= A_lcl.numRows()) {
203 const size_t end = A_lcl.graph.row_map(
lclRow + 1);
207 value_type
A_x = ATV::zero();
209 Kokkos::parallel_reduce(
217 Kokkos::single(Kokkos::PerThread(
dev), [&]() {
222 const LO
numVectors =
static_cast<LO
>(X_colmap_lcl.extent(1));
225 value_type
A_x = ATV::zero();
227 Kokkos::parallel_reduce(
235 Kokkos::single(Kokkos::PerThread(
dev), [&]() {
245template <
class SC,
class LO,
class GO,
class NO>
250 const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
252 using Teuchos::NO_TRANS;
259 using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
261 local_matrix_device_type A_lcl =
A.getLocalMatrixDevice();
270 "X.getNumVectors() = " <<
X_colmap.getNumVectors() <<
" != "
271 "R.getNumVectors() = "
272 <<
R.getNumVectors() <<
".");
274 A.getColMap()->getLocalNumElements(),
276 "X has the wrong number of local rows. "
277 "X.getLocalLength() = "
278 <<
X_colmap.getLocalLength() <<
" != "
279 "A.getColMap()->getLocalNumElements() = "
280 <<
A.getColMap()->getLocalNumElements() <<
".");
282 A.getRowMap()->getLocalNumElements(),
284 "R has the wrong number of local rows. "
285 "R.getLocalLength() = "
286 <<
R.getLocalLength() <<
" != "
287 "A.getRowMap()->getLocalNumElements() = "
288 <<
A.getRowMap()->getLocalNumElements() <<
".");
290 A.getRowMap()->getLocalNumElements(),
292 "B has the wrong number of local rows. "
293 "B.getLocalLength() = "
294 <<
B.getLocalLength() <<
" != "
295 "A.getRowMap()->getLocalNumElements() = "
296 <<
A.getRowMap()->getLocalNumElements() <<
".");
299 "The matrix A is not "
300 "fill complete. You must call fillComplete() (possibly with "
301 "domain and range Map arguments) without an intervening "
302 "resumeFill() call before you may call this method.");
304 std::runtime_error,
"X, Y and B must be constant stride.");
308 (X_colmap_lcl.data() == B_lcl.data() && X_colmap_lcl.data() !=
nullptr),
309 std::runtime_error,
"X, Y and R may not alias one another.");
313 if (!fusedResidual) {
314 SC one = Teuchos::ScalarTraits<SC>::one();
316 SC zero = Teuchos::ScalarTraits<SC>::zero();
324 if (A_lcl.numRows() == 0) {
328 int64_t numLocalRows = A_lcl.numRows();
329 int64_t myNnz = A_lcl.nnz();
332 int vector_length = -1;
333 int64_t rows_per_thread = -1;
336 using policy_type =
typename Kokkos::TeamPolicy<execution_space>;
338 int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
339 int64_t worksets = (B_lcl.extent(0) + rows_per_team - 1) / rows_per_team;
341 policy_type policy(1, 1);
343 policy = policy_type(worksets, Kokkos::AUTO, vector_length);
345 policy = policy_type(worksets, team_size, vector_length);
348 bool is_vector = (X_colmap_lcl.extent(1) == 1);
351 if (X_domainmap ==
nullptr) {
352 using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, false, false>;
353 functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
354 Kokkos::parallel_for(
"residual-vector", policy, func);
357 X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
358 using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, true, false>;
359 functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
360 Kokkos::parallel_for(
"residual-vector", policy, func);
363 if (X_domainmap ==
nullptr) {
364 using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, false, false>;
365 functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
366 Kokkos::parallel_for(
"residual-multivector", policy, func);
369 X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
370 using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, true, false>;
371 functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
372 Kokkos::parallel_for(
"residual-multivector", policy, func);
377template <
class SC,
class LO,
class GO,
class NO>
378void localResidualWithCommCompOverlap(
const CrsMatrix<SC, LO, GO, NO>& A,
379 MultiVector<SC, LO, GO, NO>& X_colmap,
380 const MultiVector<SC, LO, GO, NO>& B,
381 MultiVector<SC, LO, GO, NO>& R,
382 const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
383 const MultiVector<SC, LO, GO, NO>& X_domainmap) {
384 using Teuchos::NO_TRANS;
389 ProfilingRegion regionLocalApply(
"Tpetra::CrsMatrix::localResidualWithCommCompOverlap");
392 using local_view_device_type =
typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::non_const_type;
393 using const_local_view_device_type =
typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::const_type;
394 using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
396 local_matrix_device_type A_lcl = A.getLocalMatrixDevice();
397 const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
398 const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
399 local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
400 const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);
405 TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getNumVectors() != R.getNumVectors(), std::runtime_error,
406 "X.getNumVectors() = " << X_colmap.getNumVectors() <<
" != "
407 "R.getNumVectors() = "
408 << R.getNumVectors() <<
".");
409 TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getLocalLength() !=
410 A.getColMap()->getLocalNumElements(),
412 "X has the wrong number of local rows. "
413 "X.getLocalLength() = "
414 << X_colmap.getLocalLength() <<
" != "
415 "A.getColMap()->getLocalNumElements() = "
416 << A.getColMap()->getLocalNumElements() <<
".");
417 TEUCHOS_TEST_FOR_EXCEPTION(R.getLocalLength() !=
418 A.getRowMap()->getLocalNumElements(),
420 "R has the wrong number of local rows. "
421 "R.getLocalLength() = "
422 << R.getLocalLength() <<
" != "
423 "A.getRowMap()->getLocalNumElements() = "
424 << A.getRowMap()->getLocalNumElements() <<
".");
425 TEUCHOS_TEST_FOR_EXCEPTION(B.getLocalLength() !=
426 A.getRowMap()->getLocalNumElements(),
428 "B has the wrong number of local rows. "
429 "B.getLocalLength() = "
430 << B.getLocalLength() <<
" != "
431 "A.getRowMap()->getLocalNumElements() = "
432 << A.getRowMap()->getLocalNumElements() <<
".");
434 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
435 "The matrix A is not "
436 "fill complete. You must call fillComplete() (possibly with "
437 "domain and range Map arguments) without an intervening "
438 "resumeFill() call before you may call this method.");
439 TEUCHOS_TEST_FOR_EXCEPTION(!X_colmap.isConstantStride() || !R.isConstantStride() || !B.isConstantStride(),
440 std::runtime_error,
"X, Y and B must be constant stride.");
443 TEUCHOS_TEST_FOR_EXCEPTION((X_colmap_lcl.data() == R_lcl.data() && X_colmap_lcl.data() !=
nullptr) ||
444 (X_colmap_lcl.data() == B_lcl.data() && X_colmap_lcl.data() !=
nullptr),
445 std::runtime_error,
"X, Y and R may not alias one another.");
448 if (A_lcl.numRows() == 0) {
452 int64_t numLocalRows = A_lcl.numRows();
453 int64_t myNnz = A_lcl.nnz();
456 int vector_length = -1;
457 int64_t rows_per_thread = -1;
460 using policy_type =
typename Kokkos::TeamPolicy<execution_space>;
462 int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
463 int64_t worksets = (B_lcl.extent(0) + rows_per_team - 1) / rows_per_team;
465 policy_type policy(1, 1);
467 policy = policy_type(worksets, Kokkos::AUTO, vector_length);
469 policy = policy_type(worksets, team_size, vector_length);
472 bool is_vector = (X_colmap_lcl.extent(1) == 1);
475 using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, true, true>;
476 functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
477 Kokkos::parallel_for(
"residual-vector", policy, func);
479 RCP<const import_type> importer = A.getGraph()->getImporter();
480 X_colmap.endImport(X_domainmap, *importer,
INSERT,
true);
482 Kokkos::fence(
"Tpetra::localResidualWithCommCompOverlap-1");
484 using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false>;
485 functor_type2 func2(A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
486 Kokkos::parallel_for(
"residual-vector-offrank", policy, func2);
489 using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, true, true>;
490 functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
491 Kokkos::parallel_for(
"residual-multivector", policy, func);
493 RCP<const import_type> importer = A.getGraph()->getImporter();
494 X_colmap.endImport(X_domainmap, *importer,
INSERT,
true);
496 Kokkos::fence(
"Tpetra::localResidualWithCommCompOverlap-2");
498 using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true>;
499 functor_type2 func2(A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
500 Kokkos::parallel_for(
"residual-vector-offrank", policy, func2);
505template <
class SC,
class LO,
class GO,
class NO>
512 using Teuchos::rcp_const_cast;
513 using Teuchos::rcpFromRef;
519 if (overlapCommunicationAndComputation)
532 SC one = Teuchos::ScalarTraits<SC>::one(),
negone = -
one,
zero = Teuchos::ScalarTraits<SC>::zero();
543 using offset_type =
typename graph_type::offset_device_view_type;
547 (!
R_in.isDistributed() &&
A.getComm()->getSize() != 1);
563 if (!
X_in.isConstantStride()) {
591 "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
600 A.getCrsGraph()->getLocalOffRankOffsets(offsets);
607 if (!
R_in.isConstantStride()) {
620 if (!
B_in.isConstantStride()) {
660 if (overlapCommunicationAndComputation) {
678 if (!
R_in.isConstantStride()) {
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Struct that holds views of the contents of a CrsMatrix.
typename device_type::execution_space execution_space
The Kokkos execution space.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Import< LocalOrdinal, GlobalOrdinal, Node > import_type
The Import specialization suitable for this CrsMatrix specialization.
static bool fusedResidual()
Fusing SpMV and update in residual instead of using 2 kernel launches. Fusing kernels implies that no...
static bool debug()
Whether Tpetra is in debug mode.
static bool overlapCommunicationAndComputation()
Overlap communication and computation.
static bool skipCopyAndPermuteIfPossible()
Skip copyAndPermute if possible.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
One or more distributed dense vectors.
Implementation details of Tpetra.
void residual(const Operator< SC, LO, GO, NO > &A, const MultiVector< SC, LO, GO, NO > &X, const MultiVector< SC, LO, GO, NO > &B, MultiVector< SC, LO, GO, NO > &R)
Computes R = B - A * X.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
@ INSERT
Insert new values that don't currently exist.
Functor for computing the residual.
Functor for computing R -= A_offRank*X_colmap.