Tpetra_Details_residual.hpp
// @HEADER
// *****************************************************************************
// Tpetra: Templated Linear Algebra Services Package
//
// Copyright 2008 NTESS and the Tpetra contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_RESIDUAL_HPP
#define TPETRA_DETAILS_RESIDUAL_HPP

#include "TpetraCore_config.h"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_LocalCrsMatrixOperator.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Teuchos_RCP.hpp"
#include "Teuchos_ScalarTraits.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_Details_Profiling.hpp"
#include "KokkosSparse_spmv_impl.hpp"

namespace Tpetra {
namespace Details {
/// \brief Functor for computing the residual.
template <class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV, bool restrictedMode, bool skipOffRank>
struct LocalResidualFunctor {
  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
#if KOKKOS_VERSION >= 40799
  using ATV = KokkosKernels::ArithTraits<value_type>;
#else
  using ATV = Kokkos::ArithTraits<value_type>;
#endif

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  ConstMV B_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;
  ConstMV X_domainmap_lcl;

  LocalResidualFunctor(const AMatrix& A_lcl_,
                       const ConstMV& X_colmap_lcl_,
                       const ConstMV& B_lcl_,
                       const MV& R_lcl_,
                       const int rows_per_team_,
                       const Offsets& offsets_,
                       const ConstMV& X_domainmap_lcl_)
    : A_lcl(A_lcl_)
    , X_colmap_lcl(X_colmap_lcl_)
    , B_lcl(B_lcl_)
    , R_lcl(R_lcl_)
    , rows_per_team(rows_per_team_)
    , offsets(offsets_)
    , X_domainmap_lcl(X_domainmap_lcl_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const team_member& dev) const {
    Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team), [&](const LO& loop) {
      const LO lclRow = static_cast<LO>(dev.league_rank()) * rows_per_team + loop;

      if (lclRow >= A_lcl.numRows()) {
        return;
      }

      if (!is_MV) {  // the MultiVector has only a single column

        value_type A_x = ATV::zero();

        if (!restrictedMode) {
          const auto A_row = A_lcl.rowConst(lclRow);
          const LO row_length = static_cast<LO>(A_row.length);

          Kokkos::parallel_reduce(
              Kokkos::ThreadVectorRange(dev, row_length), [&](const LO iEntry, value_type& lsum) {
                const auto A_val = A_row.value(iEntry);
                lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry), 0);
              },
              A_x);

        } else {
          const LO offRankOffset = offsets(lclRow);
          const size_t start = A_lcl.graph.row_map(lclRow);
          const size_t end = A_lcl.graph.row_map(lclRow + 1);

          Kokkos::parallel_reduce(
              Kokkos::ThreadVectorRange(dev, start, end), [&](const LO iEntry, value_type& lsum) {
                const auto A_val = A_lcl.values(iEntry);
                const auto lclCol = A_lcl.graph.entries(iEntry);
                if (iEntry < offRankOffset)
                  lsum += A_val * X_domainmap_lcl(lclCol, 0);
                else if (!skipOffRank)
                  lsum += A_val * X_colmap_lcl(lclCol, 0);
              },
              A_x);
        }

        Kokkos::single(Kokkos::PerThread(dev), [&]() {
          R_lcl(lclRow, 0) = B_lcl(lclRow, 0) - A_x;
        });
      } else {  // the MultiVector has more than one column

        const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

        for (LO v = 0; v < numVectors; v++) {
          value_type A_x = ATV::zero();

          if (!restrictedMode) {
            const auto A_row = A_lcl.rowConst(lclRow);
            const LO row_length = static_cast<LO>(A_row.length);

            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(dev, row_length), [&](const LO iEntry, value_type& lsum) {
                  const auto A_val = A_row.value(iEntry);
                  lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry), v);
                },
                A_x);
          } else {
            const LO offRankOffset = offsets(lclRow);
            const size_t start = A_lcl.graph.row_map(lclRow);
            const size_t end = A_lcl.graph.row_map(lclRow + 1);

            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(dev, start, end), [&](const LO iEntry, value_type& lsum) {
                  const auto A_val = A_lcl.values(iEntry);
                  const auto lclCol = A_lcl.graph.entries(iEntry);
                  if (iEntry < offRankOffset)
                    lsum += A_val * X_domainmap_lcl(lclCol, v);
                  else if (!skipOffRank)
                    lsum += A_val * X_colmap_lcl(lclCol, v);
                },
                A_x);
          }

          Kokkos::single(Kokkos::PerThread(dev), [&]() {
            R_lcl(lclRow, v) = B_lcl(lclRow, v) - A_x;
          });

        }  // end for numVectors
      }
    });  // end parallel_for TeamThreadRange
  }
};
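
// Illustration only (hypothetical helper, not part of Tpetra): per local row i
// and column v, the functor above computes
//   R(i,v) = B(i,v) - sum_j A(i,j) * X(j,v)
// in a single fused pass. A minimal sequential sketch of the same computation
// over a plain CRS layout:
template <class Scalar, class Ordinal>
void residualSketch(Ordinal numRows, const Ordinal* rowPtr, const Ordinal* colInd,
                    const Scalar* val, const Scalar* X, const Scalar* B, Scalar* R) {
  for (Ordinal i = 0; i < numRows; ++i) {
    Scalar Ax = Scalar(0);
    for (Ordinal k = rowPtr[i]; k < rowPtr[i + 1]; ++k)
      Ax += val[k] * X[colInd[k]];  // SpMV contribution of row i
    R[i] = B[i] - Ax;               // fused subtraction, as in operator() above
  }
}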

/// \brief Functor for computing R -= A_offRank*X_colmap.
template <class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV>
struct OffRankUpdateFunctor {
  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
#if KOKKOS_VERSION >= 40799
  using ATV = KokkosKernels::ArithTraits<value_type>;
#else
  using ATV = Kokkos::ArithTraits<value_type>;
#endif

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;

  OffRankUpdateFunctor(const AMatrix& A_lcl_,
                       const ConstMV& X_colmap_lcl_,
                       const MV& R_lcl_,
                       const int rows_per_team_,
                       const Offsets& offsets_)
    : A_lcl(A_lcl_)
    , X_colmap_lcl(X_colmap_lcl_)
    , R_lcl(R_lcl_)
    , rows_per_team(rows_per_team_)
    , offsets(offsets_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const team_member& dev) const {
    Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team), [&](const LO& loop) {
      const LO lclRow = static_cast<LO>(dev.league_rank()) * rows_per_team + loop;

      if (lclRow >= A_lcl.numRows()) {
        return;
      }

      const LO offRankOffset = offsets(lclRow);
      const size_t end = A_lcl.graph.row_map(lclRow + 1);

      if (!is_MV) {  // the MultiVector has only a single column

        value_type A_x = ATV::zero();

        Kokkos::parallel_reduce(
            Kokkos::ThreadVectorRange(dev, offRankOffset, end), [&](const LO iEntry, value_type& lsum) {
              const auto A_val = A_lcl.values(iEntry);
              const auto lclCol = A_lcl.graph.entries(iEntry);
              lsum += A_val * X_colmap_lcl(lclCol, 0);
            },
            A_x);

        Kokkos::single(Kokkos::PerThread(dev), [&]() {
          R_lcl(lclRow, 0) -= A_x;
        });
      } else {  // the MultiVector has more than one column

        const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

        for (LO v = 0; v < numVectors; v++) {
          value_type A_x = ATV::zero();

          Kokkos::parallel_reduce(
              Kokkos::ThreadVectorRange(dev, offRankOffset, end), [&](const LO iEntry, value_type& lsum) {
                const auto A_val = A_lcl.values(iEntry);
                const auto lclCol = A_lcl.graph.entries(iEntry);
                lsum += A_val * X_colmap_lcl(lclCol, v);
              },
              A_x);

          Kokkos::single(Kokkos::PerThread(dev), [&]() {
            R_lcl(lclRow, v) -= A_x;
          });

        }  // end for numVectors
      }
    });
  }
};
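
// Note (editorial): assuming each row stores its on-rank entries first and its
// off-rank entries at the tail, with offsets(i) marking the first off-rank
// entry, OffRankUpdateFunctor applies the halo correction
//   R(i,v) -= sum_{k >= offsets(i)} values(k) * X_colmap(entries(k), v),
// completing a residual that LocalResidualFunctor started with skipOffRank = true.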

template <class SC, class LO, class GO, class NO>
void localResidual(const CrsMatrix<SC, LO, GO, NO>& A,
                   const MultiVector<SC, LO, GO, NO>& X_colmap,
                   const MultiVector<SC, LO, GO, NO>& B,
                   MultiVector<SC, LO, GO, NO>& R,
                   const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                   const MultiVector<SC, LO, GO, NO>* X_domainmap = nullptr) {
  using Teuchos::NO_TRANS;
  using Tpetra::Details::ProfilingRegion;
  ProfilingRegion regionLocalApply("Tpetra::CrsMatrix::localResidual");

  using local_matrix_device_type = typename CrsMatrix<SC, LO, GO, NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl;

  const bool debug = ::Tpetra::Details::Behavior::debug();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getNumVectors() != R.getNumVectors(), std::runtime_error,
                               "X.getNumVectors() = " << X_colmap.getNumVectors()
                               << " != R.getNumVectors() = " << R.getNumVectors() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getLocalLength() != A.getColMap()->getLocalNumElements(),
                               std::runtime_error,
                               "X has the wrong number of local rows. X.getLocalLength() = "
                               << X_colmap.getLocalLength() << " != A.getColMap()->getLocalNumElements() = "
                               << A.getColMap()->getLocalNumElements() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(R.getLocalLength() != A.getRowMap()->getLocalNumElements(),
                               std::runtime_error,
                               "R has the wrong number of local rows. R.getLocalLength() = "
                               << R.getLocalLength() << " != A.getRowMap()->getLocalNumElements() = "
                               << A.getRowMap()->getLocalNumElements() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(B.getLocalLength() != A.getRowMap()->getLocalNumElements(),
                               std::runtime_error,
                               "B has the wrong number of local rows. B.getLocalLength() = "
                               << B.getLocalLength() << " != A.getRowMap()->getLocalNumElements() = "
                               << A.getRowMap()->getLocalNumElements() << ".");

    TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
                               "The matrix A is not fill complete. You must call fillComplete() "
                               "(possibly with domain and range Map arguments) without an "
                               "intervening resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION(!X_colmap.isConstantStride() || !R.isConstantStride() || !B.isConstantStride(),
                               std::runtime_error, "X, R and B must be constant stride.");
    // If both pointers are null, they don't alias one another, even
    // though they compare equal.
    TEUCHOS_TEST_FOR_EXCEPTION((X_colmap_lcl.data() == R_lcl.data() && X_colmap_lcl.data() != nullptr) ||
                               (X_colmap_lcl.data() == B_lcl.data() && X_colmap_lcl.data() != nullptr),
                               std::runtime_error, "X, B and R may not alias one another.");
  }

  const bool fusedResidual = ::Tpetra::Details::Behavior::fusedResidual();
  if (!fusedResidual) {
    SC one = Teuchos::ScalarTraits<SC>::one();
    SC negone = -one;
    SC zero = Teuchos::ScalarTraits<SC>::zero();
    // This is currently a "reference implementation" waiting until Kokkos Kernels provides
    // a residual kernel.
    A.localApply(X_colmap, R, Teuchos::NO_TRANS, one, zero);
    R.update(one, B, negone);
    return;
  }

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows();
  int64_t myNnz = A_lcl.nnz();

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC, LO, GO, NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent(0) + rows_per_team - 1) / rows_per_team;  // ceiling divide

  policy_type policy(1, 1);
  if (team_size < 0) {
    policy = policy_type(worksets, Kokkos::AUTO, vector_length);
  } else {
    policy = policy_type(worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

  if (is_vector) {
    if (X_domainmap == nullptr) {
      using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, false, false>;
      functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector", policy, func);

    } else {
      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, true, false>;
      functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector", policy, func);
    }
  } else {
    if (X_domainmap == nullptr) {
      using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, false, false>;
      functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector", policy, func);

    } else {
      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, true, false>;
      functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector", policy, func);
    }
  }
}
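
// Dispatch summary (editorial) for the LocalResidualFunctor instantiations above:
//   is_MV          - true when X, B, and R have more than one column
//   restrictedMode - true when X_domainmap is provided: on-rank entries read
//                    X_domainmap, off-rank (halo) entries read X_colmap
//   skipOffRank    - always false here; deferring off-rank work is used only by
//                    localResidualWithCommCompOverlap below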

template <class SC, class LO, class GO, class NO>
void localResidualWithCommCompOverlap(const CrsMatrix<SC, LO, GO, NO>& A,
                                      MultiVector<SC, LO, GO, NO>& X_colmap,
                                      const MultiVector<SC, LO, GO, NO>& B,
                                      MultiVector<SC, LO, GO, NO>& R,
                                      const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                                      const MultiVector<SC, LO, GO, NO>& X_domainmap) {
  using Teuchos::NO_TRANS;
  using Teuchos::RCP;
  using Tpetra::Details::ProfilingRegion;
  using import_type = typename CrsMatrix<SC, LO, GO, NO>::import_type;

  ProfilingRegion regionLocalApply("Tpetra::CrsMatrix::localResidualWithCommCompOverlap");

  using local_matrix_device_type = typename CrsMatrix<SC, LO, GO, NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);

  const bool debug = ::Tpetra::Details::Behavior::debug();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getNumVectors() != R.getNumVectors(), std::runtime_error,
                               "X.getNumVectors() = " << X_colmap.getNumVectors()
                               << " != R.getNumVectors() = " << R.getNumVectors() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getLocalLength() != A.getColMap()->getLocalNumElements(),
                               std::runtime_error,
                               "X has the wrong number of local rows. X.getLocalLength() = "
                               << X_colmap.getLocalLength() << " != A.getColMap()->getLocalNumElements() = "
                               << A.getColMap()->getLocalNumElements() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(R.getLocalLength() != A.getRowMap()->getLocalNumElements(),
                               std::runtime_error,
                               "R has the wrong number of local rows. R.getLocalLength() = "
                               << R.getLocalLength() << " != A.getRowMap()->getLocalNumElements() = "
                               << A.getRowMap()->getLocalNumElements() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(B.getLocalLength() != A.getRowMap()->getLocalNumElements(),
                               std::runtime_error,
                               "B has the wrong number of local rows. B.getLocalLength() = "
                               << B.getLocalLength() << " != A.getRowMap()->getLocalNumElements() = "
                               << A.getRowMap()->getLocalNumElements() << ".");

    TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
                               "The matrix A is not fill complete. You must call fillComplete() "
                               "(possibly with domain and range Map arguments) without an "
                               "intervening resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION(!X_colmap.isConstantStride() || !R.isConstantStride() || !B.isConstantStride(),
                               std::runtime_error, "X, R and B must be constant stride.");
    // If both pointers are null, they don't alias one another, even
    // though they compare equal.
    TEUCHOS_TEST_FOR_EXCEPTION((X_colmap_lcl.data() == R_lcl.data() && X_colmap_lcl.data() != nullptr) ||
                               (X_colmap_lcl.data() == B_lcl.data() && X_colmap_lcl.data() != nullptr),
                               std::runtime_error, "X, B and R may not alias one another.");
  }

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows();
  int64_t myNnz = A_lcl.nnz();

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC, LO, GO, NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent(0) + rows_per_team - 1) / rows_per_team;  // ceiling divide

  policy_type policy(1, 1);
  if (team_size < 0) {
    policy = policy_type(worksets, Kokkos::AUTO, vector_length);
  } else {
    policy = policy_type(worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

  if (is_vector) {
    using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, true, true>;
    functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-vector", policy, func);

    RCP<const import_type> importer = A.getGraph()->getImporter();
    X_colmap.endImport(X_domainmap, *importer, INSERT, true);

    Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-1");

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false>;
    functor_type2 func2(A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-vector-offrank", policy, func2);

  } else {
    using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, true, true>;
    functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-multivector", policy, func);

    RCP<const import_type> importer = A.getGraph()->getImporter();
    X_colmap.endImport(X_domainmap, *importer, INSERT, true);

    Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-2");

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true>;
    functor_type2 func2(A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-vector-offrank", policy, func2);
  }
}
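
// Pattern sketch (editorial): the caller is expected to have started the halo
// exchange with X_colMap->beginImport(X_in, *importer, INSERT, true).
// This function then
//   1. computes the on-rank part of the residual (skipOffRank = true),
//   2. calls X_colmap.endImport(...) to finish the exchange,
//   3. fences to make the imported halo values visible, and
//   4. subtracts the off-rank contributions with OffRankUpdateFunctor,
// so the MPI communication overlaps the first kernel launch.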

/// \brief Computes R = B - A * X.
template <class SC, class LO, class GO, class NO>
void residual(const Operator<SC, LO, GO, NO>& Aop,
              const MultiVector<SC, LO, GO, NO>& X_in,
              const MultiVector<SC, LO, GO, NO>& B_in,
              MultiVector<SC, LO, GO, NO>& R_in) {
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_const_cast;
  using Teuchos::rcpFromRef;
  using Tpetra::Details::ProfilingRegion;

  const bool debug = ::Tpetra::Details::Behavior::debug();
  const bool skipCopyAndPermuteIfPossible = ::Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible();
  const bool overlapCommunicationAndComputation = ::Tpetra::Details::Behavior::overlapCommunicationAndComputation();
  if (overlapCommunicationAndComputation)
    TEUCHOS_ASSERT(skipCopyAndPermuteIfPossible);

  // Whether we are using restrictedMode in the import from domain to
  // column map. Restricted mode skips the copy and permutation of the
  // local part of X. We are using restrictedMode only when domain and
  // column map are locally fitted, i.e. when the local indices of
  // domain and column map match.
  bool restrictedMode = false;

  const CrsMatrix<SC, LO, GO, NO>* Apt = dynamic_cast<const CrsMatrix<SC, LO, GO, NO>*>(&Aop);
  if (!Apt) {
    // If we're not a CrsMatrix, we can't do fusion, so just do apply + update.
    SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
    Aop.apply(X_in, R_in, Teuchos::NO_TRANS, one, zero);
    R_in.update(one, B_in, negone);
    return;
  }
  const CrsMatrix<SC, LO, GO, NO>& A = *Apt;

  using import_type = typename CrsMatrix<SC, LO, GO, NO>::import_type;
  using export_type = typename CrsMatrix<SC, LO, GO, NO>::export_type;
  using MV = MultiVector<SC, LO, GO, NO>;
  using graph_type = typename CrsMatrix<SC, LO, GO, NO>::crs_graph_type;
  using offset_type = typename graph_type::offset_device_view_type;

  // We treat the case of a replicated MV output specially.
  const bool R_is_replicated =
      (!R_in.isDistributed() && A.getComm()->getSize() != 1);

  // It's possible that R is a view of X or B.
  // We don't try to detect the more subtle cases (e.g., one is a
  // subview of the other, but their initial pointers differ). We
  // only need to do this if this matrix's Import is trivial;
  // otherwise, we don't actually apply the operator from X into R.

  RCP<const import_type> importer = A.getGraph()->getImporter();
  RCP<const export_type> exporter = A.getGraph()->getExporter();

  // Temporary MV for the Import operation. After the block of code
  // below, this will be an (Imported if necessary) column Map MV
  // ready to give to localApply(...).
  RCP<MV> X_colMap;
  if (importer.is_null()) {
    if (!X_in.isConstantStride()) {
      // Not all sparse mat-vec kernels can handle an input MV with
      // nonconstant stride correctly, so we have to copy it in that
      // case into a constant stride MV. To make a constant stride
      // copy of X_in, we force creation of the column (== domain)
      // Map MV (if it hasn't already been created, else fetch the
      // cached copy). This avoids creating a new MV each time.
      X_colMap = A.getColumnMapMultiVector(X_in, true);
      Tpetra::deep_copy(*X_colMap, X_in);
      // X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
    } else {
      // The domain and column Maps are the same, so do the local
      // multiply using the domain Map input MV X_in.
      X_colMap = rcp_const_cast<MV>(rcpFromRef(X_in));
    }
  } else {  // need to Import the source (multi)vector
    ProfilingRegion regionImport("Tpetra::CrsMatrix::residual: Import");
    // We're doing an Import anyway, which will copy the relevant
    // elements of the domain Map MV X_in into a separate column Map
    // MV. Thus, we don't have to worry whether X_in is constant
    // stride.
    X_colMap = A.getColumnMapMultiVector(X_in);

    // Do we want to use restrictedMode?
    restrictedMode = skipCopyAndPermuteIfPossible && importer->isLocallyFitted();

    if (debug && restrictedMode) {
      TEUCHOS_TEST_FOR_EXCEPTION(!importer->getTargetMap()->isLocallyFitted(*importer->getSourceMap()), std::runtime_error,
                                 "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
    }

    // Import from the domain Map MV to the column Map MV.
    X_colMap->beginImport(X_in, *importer, INSERT, restrictedMode);
  }

  offset_type offsets;
  if (restrictedMode)
    A.getCrsGraph()->getLocalOffRankOffsets(offsets);

  // Get a vector for the R_rowMap output residual, handling the
  // non-constant stride and exporter cases. Since R gets clobbered,
  // we don't need to worry about the data in it.
  RCP<MV> R_rowMap;
  if (exporter.is_null()) {
    if (!R_in.isConstantStride()) {
      R_rowMap = A.getRowMapMultiVector(R_in);
    } else {
      R_rowMap = rcpFromRef(R_in);
    }
  } else {
    R_rowMap = A.getRowMapMultiVector(R_in);
  }

  // Get a vector for the B_rowMap input, handling the
  // non-constant stride and exporter cases.
  RCP<const MV> B_rowMap;
  if (exporter.is_null()) {
    if (!B_in.isConstantStride()) {
      // Do an allocation here. If we need to optimize this later, we can have the matrix
      // cache this.
      RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(), B_in.getNumVectors()));
      Tpetra::deep_copy(*B_rowMapNonConst, B_in);
      B_rowMap = rcp_const_cast<const MV>(B_rowMapNonConst);
    } else {
      B_rowMap = rcpFromRef(B_in);
    }
  } else {
    // Do an allocation here. If we need to optimize this later, we can have the matrix
    // cache this.
    ProfilingRegion regionExport("Tpetra::CrsMatrix::residual: B Import");
    RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(), B_in.getNumVectors()));
    B_rowMapNonConst->doImport(B_in, *exporter, ADD);
    B_rowMap = rcp_const_cast<const MV>(B_rowMapNonConst);
  }

  // If we have a nontrivial Export object, we must perform an
  // Export. In that case, the local multiply result will go into
  // the row Map multivector. We don't have to make a
  // constant-stride version of R_in in this case, because we had to
  // make a constant stride R_rowMap MV and do an Export anyway.
  if (!exporter.is_null()) {
    if (!importer.is_null())
      X_colMap->endImport(X_in, *importer, INSERT, restrictedMode);
    if (restrictedMode && !importer.is_null())
      localResidual(A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
    else
      localResidual(A, *X_colMap, *B_rowMap, *R_rowMap, offsets);

    {
      ProfilingRegion regionExport("Tpetra::CrsMatrix::residual: R Export");

      // Do the Export operation.
      R_in.doExport(*R_rowMap, *exporter, ADD);
    }
  } else {  // Don't do an Export: row Map and range Map are the same.

    if (restrictedMode)
      if (overlapCommunicationAndComputation) {
        localResidualWithCommCompOverlap(A, *X_colMap, *B_rowMap, *R_rowMap, offsets, X_in);
      } else {
        X_colMap->endImport(X_in, *importer, INSERT, true);
        localResidual(A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
      }
    else {
      if (!importer.is_null())
        X_colMap->endImport(X_in, *importer, INSERT, false);
      localResidual(A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
    }

    //
    // If R_in does not have constant stride,
    // then we can't let the kernel write directly to R_in.
    // Instead, we have to use the cached row (== range)
    // Map MV as temporary storage.
    //
    if (!R_in.isConstantStride()) {
      // We need to be sure to do a copy out in this case.
      Tpetra::deep_copy(R_in, *R_rowMap);
    }
  }

  // If the range Map is a locally replicated Map, sum up
  // contributions from each process.
  if (R_is_replicated) {
    ProfilingRegion regionReduce("Tpetra::CrsMatrix::residual: Reduce Y");
    R_in.reduce();
  }
}
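
// Usage sketch (hypothetical, not part of this header): computing R = B - A*X
// for a fill-complete CrsMatrix through the entry point above. CrsMatrix is an
// Operator, so it binds directly to the first argument.
template <class SC, class LO, class GO, class NO>
void exampleResidualCall(const CrsMatrix<SC, LO, GO, NO>& A,
                         const MultiVector<SC, LO, GO, NO>& X,
                         const MultiVector<SC, LO, GO, NO>& B,
                         MultiVector<SC, LO, GO, NO>& R) {
  residual(A, X, B, R);  // handles the halo Import, fused kernel, and any Export
}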

}  // namespace Details
}  // namespace Tpetra

#endif  // TPETRA_DETAILS_RESIDUAL_HPP