10#ifndef TPETRA_DETAILS_RESIDUAL_HPP
11#define TPETRA_DETAILS_RESIDUAL_HPP
13#include "TpetraCore_config.h"
14#include "Tpetra_CrsMatrix.hpp"
15#include "Tpetra_LocalCrsMatrixOperator.hpp"
16#include "Tpetra_MultiVector.hpp"
17#include "Teuchos_RCP.hpp"
18#include "Teuchos_ScalarTraits.hpp"
21#include "KokkosSparse_spmv_impl.hpp"
38template <
class AMatrix,
class MV,
class ConstMV,
class Offsets,
bool is_MV,
bool restrictedMode,
bool skipOffRank>
40 using execution_space =
typename AMatrix::execution_space;
41 using LO =
typename AMatrix::non_const_ordinal_type;
42 using value_type =
typename AMatrix::non_const_value_type;
43 using team_policy =
typename Kokkos::TeamPolicy<execution_space>;
44 using team_member =
typename team_policy::member_type;
45 using ATV = KokkosKernels::ArithTraits<value_type>;
71 void operator()(
const team_member&
dev)
const {
72 Kokkos::parallel_for(Kokkos::TeamThreadRange(
dev, 0, rows_per_team), [&](
const LO&
loop) {
73 const LO
lclRow =
static_cast<LO
>(
dev.league_rank()) * rows_per_team +
loop;
75 if (
lclRow >= A_lcl.numRows()) {
81 value_type
A_x = ATV::zero();
87 Kokkos::parallel_reduce(
96 const size_t start = A_lcl.graph.row_map(
lclRow);
97 const size_t end = A_lcl.graph.row_map(
lclRow + 1);
99 Kokkos::parallel_reduce(
100 Kokkos::ThreadVectorRange(
dev, start,
end), [&](
const LO
iEntry, value_type&
lsum) {
111 Kokkos::single(Kokkos::PerThread(
dev), [&]() {
116 const LO
numVectors =
static_cast<LO
>(X_colmap_lcl.extent(1));
119 value_type
A_x = ATV::zero();
125 Kokkos::parallel_reduce(
133 const size_t start = A_lcl.graph.row_map(
lclRow);
134 const size_t end = A_lcl.graph.row_map(
lclRow + 1);
136 Kokkos::parallel_reduce(
137 Kokkos::ThreadVectorRange(
dev, start,
end), [&](
const LO
iEntry, value_type&
lsum) {
148 Kokkos::single(Kokkos::PerThread(
dev), [&]() {
159template <
class AMatrix,
class MV,
class ConstMV,
class Offsets,
bool is_MV>
161 using execution_space =
typename AMatrix::execution_space;
162 using LO =
typename AMatrix::non_const_ordinal_type;
163 using value_type =
typename AMatrix::non_const_value_type;
164 using team_policy =
typename Kokkos::TeamPolicy<execution_space>;
165 using team_member =
typename team_policy::member_type;
166 using ATV = KokkosKernels::ArithTraits<value_type>;
183 , offsets(offsets_) {}
186 void operator()(
const team_member&
dev)
const {
187 Kokkos::parallel_for(Kokkos::TeamThreadRange(
dev, 0, rows_per_team), [&](
const LO&
loop) {
188 const LO
lclRow =
static_cast<LO
>(
dev.league_rank()) * rows_per_team +
loop;
190 if (
lclRow >= A_lcl.numRows()) {
195 const size_t end = A_lcl.graph.row_map(
lclRow + 1);
199 value_type
A_x = ATV::zero();
201 Kokkos::parallel_reduce(
209 Kokkos::single(Kokkos::PerThread(
dev), [&]() {
214 const LO
numVectors =
static_cast<LO
>(X_colmap_lcl.extent(1));
217 value_type
A_x = ATV::zero();
219 Kokkos::parallel_reduce(
227 Kokkos::single(Kokkos::PerThread(
dev), [&]() {
237template <
class SC,
class LO,
class GO,
class NO>
242 const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
244 using Teuchos::NO_TRANS;
251 using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
253 local_matrix_device_type A_lcl =
A.getLocalMatrixDevice();
262 "X.getNumVectors() = " <<
X_colmap.getNumVectors() <<
" != "
263 "R.getNumVectors() = "
264 <<
R.getNumVectors() <<
".");
266 A.getColMap()->getLocalNumElements(),
268 "X has the wrong number of local rows. "
269 "X.getLocalLength() = "
270 <<
X_colmap.getLocalLength() <<
" != "
271 "A.getColMap()->getLocalNumElements() = "
272 <<
A.getColMap()->getLocalNumElements() <<
".");
274 A.getRowMap()->getLocalNumElements(),
276 "R has the wrong number of local rows. "
277 "R.getLocalLength() = "
278 <<
R.getLocalLength() <<
" != "
279 "A.getRowMap()->getLocalNumElements() = "
280 <<
A.getRowMap()->getLocalNumElements() <<
".");
282 A.getRowMap()->getLocalNumElements(),
284 "B has the wrong number of local rows. "
285 "B.getLocalLength() = "
286 <<
B.getLocalLength() <<
" != "
287 "A.getRowMap()->getLocalNumElements() = "
288 <<
A.getRowMap()->getLocalNumElements() <<
".");
291 "The matrix A is not "
292 "fill complete. You must call fillComplete() (possibly with "
293 "domain and range Map arguments) without an intervening "
294 "resumeFill() call before you may call this method.");
296 std::runtime_error,
"X, Y and B must be constant stride.");
300 (X_colmap_lcl.data() == B_lcl.data() && X_colmap_lcl.data() !=
nullptr),
301 std::runtime_error,
"X, Y and R may not alias one another.");
305 if (!fusedResidual) {
306 SC one = Teuchos::ScalarTraits<SC>::one();
308 SC zero = Teuchos::ScalarTraits<SC>::zero();
316 if (A_lcl.numRows() == 0) {
320 int64_t numLocalRows = A_lcl.numRows();
321 int64_t myNnz = A_lcl.nnz();
324 int vector_length = -1;
325 int64_t rows_per_thread = -1;
328 using policy_type =
typename Kokkos::TeamPolicy<execution_space>;
330 int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
331 int64_t worksets = (B_lcl.extent(0) + rows_per_team - 1) / rows_per_team;
333 policy_type policy(1, 1);
335 policy = policy_type(worksets, Kokkos::AUTO, vector_length);
337 policy = policy_type(worksets, team_size, vector_length);
340 bool is_vector = (X_colmap_lcl.extent(1) == 1);
343 if (X_domainmap ==
nullptr) {
344 using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, false, false>;
345 functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
346 Kokkos::parallel_for(
"residual-vector", policy, func);
349 X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
350 using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, true, false>;
351 functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
352 Kokkos::parallel_for(
"residual-vector", policy, func);
355 if (X_domainmap ==
nullptr) {
356 using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, false, false>;
357 functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
358 Kokkos::parallel_for(
"residual-multivector", policy, func);
361 X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
362 using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, true, false>;
363 functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
364 Kokkos::parallel_for(
"residual-multivector", policy, func);
369template <
class SC,
class LO,
class GO,
class NO>
370void localResidualWithCommCompOverlap(
const CrsMatrix<SC, LO, GO, NO>& A,
371 MultiVector<SC, LO, GO, NO>& X_colmap,
372 const MultiVector<SC, LO, GO, NO>& B,
373 MultiVector<SC, LO, GO, NO>& R,
374 const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
375 const MultiVector<SC, LO, GO, NO>& X_domainmap) {
376 using Teuchos::NO_TRANS;
381 ProfilingRegion regionLocalApply(
"Tpetra::CrsMatrix::localResidualWithCommCompOverlap");
384 using local_view_device_type =
typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::non_const_type;
385 using const_local_view_device_type =
typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::const_type;
386 using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
388 local_matrix_device_type A_lcl = A.getLocalMatrixDevice();
389 const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
390 const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
391 local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
392 const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);
397 TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getNumVectors() != R.getNumVectors(), std::runtime_error,
398 "X.getNumVectors() = " << X_colmap.getNumVectors() <<
" != "
399 "R.getNumVectors() = "
400 << R.getNumVectors() <<
".");
401 TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getLocalLength() !=
402 A.getColMap()->getLocalNumElements(),
404 "X has the wrong number of local rows. "
405 "X.getLocalLength() = "
406 << X_colmap.getLocalLength() <<
" != "
407 "A.getColMap()->getLocalNumElements() = "
408 << A.getColMap()->getLocalNumElements() <<
".");
409 TEUCHOS_TEST_FOR_EXCEPTION(R.getLocalLength() !=
410 A.getRowMap()->getLocalNumElements(),
412 "R has the wrong number of local rows. "
413 "R.getLocalLength() = "
414 << R.getLocalLength() <<
" != "
415 "A.getRowMap()->getLocalNumElements() = "
416 << A.getRowMap()->getLocalNumElements() <<
".");
417 TEUCHOS_TEST_FOR_EXCEPTION(B.getLocalLength() !=
418 A.getRowMap()->getLocalNumElements(),
420 "B has the wrong number of local rows. "
421 "B.getLocalLength() = "
422 << B.getLocalLength() <<
" != "
423 "A.getRowMap()->getLocalNumElements() = "
424 << A.getRowMap()->getLocalNumElements() <<
".");
426 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
427 "The matrix A is not "
428 "fill complete. You must call fillComplete() (possibly with "
429 "domain and range Map arguments) without an intervening "
430 "resumeFill() call before you may call this method.");
431 TEUCHOS_TEST_FOR_EXCEPTION(!X_colmap.isConstantStride() || !R.isConstantStride() || !B.isConstantStride(),
432 std::runtime_error,
"X, Y and B must be constant stride.");
435 TEUCHOS_TEST_FOR_EXCEPTION((X_colmap_lcl.data() == R_lcl.data() && X_colmap_lcl.data() !=
nullptr) ||
436 (X_colmap_lcl.data() == B_lcl.data() && X_colmap_lcl.data() !=
nullptr),
437 std::runtime_error,
"X, Y and R may not alias one another.");
440 if (A_lcl.numRows() == 0) {
444 int64_t numLocalRows = A_lcl.numRows();
445 int64_t myNnz = A_lcl.nnz();
448 int vector_length = -1;
449 int64_t rows_per_thread = -1;
452 using policy_type =
typename Kokkos::TeamPolicy<execution_space>;
454 int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
455 int64_t worksets = (B_lcl.extent(0) + rows_per_team - 1) / rows_per_team;
457 policy_type policy(1, 1);
459 policy = policy_type(worksets, Kokkos::AUTO, vector_length);
461 policy = policy_type(worksets, team_size, vector_length);
464 bool is_vector = (X_colmap_lcl.extent(1) == 1);
467 using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, true, true>;
468 functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
469 Kokkos::parallel_for(
"residual-vector", policy, func);
471 RCP<const import_type> importer = A.getGraph()->getImporter();
472 X_colmap.endImport(X_domainmap, *importer,
INSERT,
true);
474 Kokkos::fence(
"Tpetra::localResidualWithCommCompOverlap-1");
476 using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false>;
477 functor_type2 func2(A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
478 Kokkos::parallel_for(
"residual-vector-offrank", policy, func2);
481 using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, true, true>;
482 functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
483 Kokkos::parallel_for(
"residual-multivector", policy, func);
485 RCP<const import_type> importer = A.getGraph()->getImporter();
486 X_colmap.endImport(X_domainmap, *importer,
INSERT,
true);
488 Kokkos::fence(
"Tpetra::localResidualWithCommCompOverlap-2");
490 using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true>;
491 functor_type2 func2(A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
492 Kokkos::parallel_for(
"residual-vector-offrank", policy, func2);
497template <
class SC,
class LO,
class GO,
class NO>
504 using Teuchos::rcp_const_cast;
505 using Teuchos::rcpFromRef;
511 if (overlapCommunicationAndComputation)
524 SC one = Teuchos::ScalarTraits<SC>::one(),
negone = -
one,
zero = Teuchos::ScalarTraits<SC>::zero();
535 using offset_type =
typename graph_type::offset_device_view_type;
539 (!
R_in.isDistributed() &&
A.getComm()->getSize() != 1);
555 if (!
X_in.isConstantStride()) {
583 "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
592 A.getCrsGraph()->getLocalOffRankOffsets(offsets);
599 if (!
R_in.isConstantStride()) {
612 if (!
B_in.isConstantStride()) {
652 if (overlapCommunicationAndComputation) {
670 if (!
R_in.isConstantStride()) {
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
typename device_type::execution_space execution_space
The Kokkos execution space.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI process.
Import< LocalOrdinal, GlobalOrdinal, Node > import_type
The Import specialization suitable for this CrsMatrix specialization.
Struct that holds views of the contents of a CrsMatrix.
static bool fusedResidual()
Fusing SpMV and update in residual instead of using 2 kernel launches. Fusing kernels implies that no third-party (TPL) SpMV implementation is used for the residual.
static bool debug()
Whether Tpetra is in debug mode.
static bool overlapCommunicationAndComputation()
Overlap communication and computation.
static bool skipCopyAndPermuteIfPossible()
Skip copyAndPermute if possible.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distribution.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distribution.
One or more distributed dense vectors.
Implementation details of Tpetra.
void residual(const Operator< SC, LO, GO, NO > &A, const MultiVector< SC, LO, GO, NO > &X, const MultiVector< SC, LO, GO, NO > &B, MultiVector< SC, LO, GO, NO > &R)
Computes R = B - A * X.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
@ INSERT
Insert new values that don't currently exist.
Functor for computing the residual.
Functor for computing R -= A_offRank*X_colmap.