Tpetra_Details_residual.hpp
// @HEADER
// *****************************************************************************
// Tpetra: Templated Linear Algebra Services Package
//
// Copyright 2008 NTESS and the Tpetra contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_RESIDUAL_HPP
#define TPETRA_DETAILS_RESIDUAL_HPP

#include "TpetraCore_config.h"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_LocalCrsMatrixOperator.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Teuchos_RCP.hpp"
#include "Teuchos_ScalarTraits.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_Details_Profiling.hpp"
#include "KokkosSparse_spmv_impl.hpp"

namespace Tpetra {
namespace Details {

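/// \brief Functor for computing the residual R = B - A*X.
///
/// The compile-time flags select the code path at instantiation time:
/// \c is_MV enables the multi-column loop, \c restrictedMode splits
/// each row at offsets(lclRow) into on-rank entries (read from the
/// domain Map view X_domainmap_lcl) and off-rank entries (read from
/// the column Map view X_colmap_lcl), and \c skipOffRank omits the
/// off-rank contributions so that OffRankUpdateFunctor can apply them
/// once the Import has completed.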
template <class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV, bool restrictedMode, bool skipOffRank>
struct LocalResidualFunctor {
  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = KokkosKernels::ArithTraits<value_type>;

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  ConstMV B_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;
  ConstMV X_domainmap_lcl;

  LocalResidualFunctor(const AMatrix& A_lcl_,
                       const ConstMV& X_colmap_lcl_,
                       const ConstMV& B_lcl_,
                       const MV& R_lcl_,
                       const int rows_per_team_,
                       const Offsets& offsets_,
                       const ConstMV& X_domainmap_lcl_)
    : A_lcl(A_lcl_)
    , X_colmap_lcl(X_colmap_lcl_)
    , B_lcl(B_lcl_)
    , R_lcl(R_lcl_)
    , rows_per_team(rows_per_team_)
    , offsets(offsets_)
    , X_domainmap_lcl(X_domainmap_lcl_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const team_member& dev) const {
    Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team), [&](const LO& loop) {
      const LO lclRow = static_cast<LO>(dev.league_rank()) * rows_per_team + loop;

      if (lclRow >= A_lcl.numRows()) {
        return;
      }

      if (!is_MV) { // The MultiVector has only a single column.

        value_type A_x = ATV::zero();

        if (!restrictedMode) {
          const auto A_row = A_lcl.rowConst(lclRow);
          const LO row_length = static_cast<LO>(A_row.length);

          Kokkos::parallel_reduce(
              Kokkos::ThreadVectorRange(dev, row_length), [&](const LO iEntry, value_type& lsum) {
                const auto A_val = A_row.value(iEntry);
                lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry), 0);
              },
              A_x);

        } else {
          const LO offRankOffset = offsets(lclRow);
          const size_t start = A_lcl.graph.row_map(lclRow);
          const size_t end = A_lcl.graph.row_map(lclRow + 1);

          Kokkos::parallel_reduce(
              Kokkos::ThreadVectorRange(dev, start, end), [&](const LO iEntry, value_type& lsum) {
                const auto A_val = A_lcl.values(iEntry);
                const auto lclCol = A_lcl.graph.entries(iEntry);
                if (iEntry < offRankOffset)
                  lsum += A_val * X_domainmap_lcl(lclCol, 0);
                else if (!skipOffRank)
                  lsum += A_val * X_colmap_lcl(lclCol, 0);
              },
              A_x);
        }

        Kokkos::single(Kokkos::PerThread(dev), [&]() {
          R_lcl(lclRow, 0) = B_lcl(lclRow, 0) - A_x;
        });
      } else { // The MultiVector has more than one column.

        const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

        for (LO v = 0; v < numVectors; v++) {
          value_type A_x = ATV::zero();

          if (!restrictedMode) {
            const auto A_row = A_lcl.rowConst(lclRow);
            const LO row_length = static_cast<LO>(A_row.length);

            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(dev, row_length), [&](const LO iEntry, value_type& lsum) {
                  const auto A_val = A_row.value(iEntry);
                  lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry), v);
                },
                A_x);
          } else {
            const LO offRankOffset = offsets(lclRow);
            const size_t start = A_lcl.graph.row_map(lclRow);
            const size_t end = A_lcl.graph.row_map(lclRow + 1);

            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(dev, start, end), [&](const LO iEntry, value_type& lsum) {
                  const auto A_val = A_lcl.values(iEntry);
                  const auto lclCol = A_lcl.graph.entries(iEntry);
                  if (iEntry < offRankOffset)
                    lsum += A_val * X_domainmap_lcl(lclCol, v);
                  else if (!skipOffRank)
                    lsum += A_val * X_colmap_lcl(lclCol, v);
                },
                A_x);
          }

          Kokkos::single(Kokkos::PerThread(dev), [&]() {
            R_lcl(lclRow, v) = B_lcl(lclRow, v) - A_x;
          });

        } // end for numVectors
      }
    }); // end parallel_for TeamThreadRange
  }
};

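/// \brief Functor for computing R -= A_offRank*X_colmap.
///
/// Applies only the off-rank entries of each row (those at index
/// offsets(lclRow) and beyond), subtracting their contribution from a
/// residual that was previously computed with skipOffRank = true.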
template <class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV>
struct OffRankUpdateFunctor {
  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = KokkosKernels::ArithTraits<value_type>;

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;

  OffRankUpdateFunctor(const AMatrix& A_lcl_,
                       const ConstMV& X_colmap_lcl_,
                       const MV& R_lcl_,
                       const int rows_per_team_,
                       const Offsets& offsets_)
    : A_lcl(A_lcl_)
    , X_colmap_lcl(X_colmap_lcl_)
    , R_lcl(R_lcl_)
    , rows_per_team(rows_per_team_)
    , offsets(offsets_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const team_member& dev) const {
    Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team), [&](const LO& loop) {
      const LO lclRow = static_cast<LO>(dev.league_rank()) * rows_per_team + loop;

      if (lclRow >= A_lcl.numRows()) {
        return;
      }

      const LO offRankOffset = offsets(lclRow);
      const size_t end = A_lcl.graph.row_map(lclRow + 1);

      if (!is_MV) { // The MultiVector has only a single column.

        value_type A_x = ATV::zero();

        Kokkos::parallel_reduce(
            Kokkos::ThreadVectorRange(dev, offRankOffset, end), [&](const LO iEntry, value_type& lsum) {
              const auto A_val = A_lcl.values(iEntry);
              const auto lclCol = A_lcl.graph.entries(iEntry);
              lsum += A_val * X_colmap_lcl(lclCol, 0);
            },
            A_x);

        Kokkos::single(Kokkos::PerThread(dev), [&]() {
          R_lcl(lclRow, 0) -= A_x;
        });
      } else { // The MultiVector has more than one column.

        const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

        for (LO v = 0; v < numVectors; v++) {
          value_type A_x = ATV::zero();

          Kokkos::parallel_reduce(
              Kokkos::ThreadVectorRange(dev, offRankOffset, end), [&](const LO iEntry, value_type& lsum) {
                const auto A_val = A_lcl.values(iEntry);
                const auto lclCol = A_lcl.graph.entries(iEntry);
                lsum += A_val * X_colmap_lcl(lclCol, v);
              },
              A_x);

          Kokkos::single(Kokkos::PerThread(dev), [&]() {
            R_lcl(lclRow, v) -= A_x;
          });

        } // end for numVectors
      }
    });
  }
};

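/// \brief Computes the local part of the residual R = B - A*X.
///
/// If the fusedResidual behavior flag is off, this falls back to
/// localApply() followed by update(); otherwise it launches a fused
/// residual kernel. If X_domainmap is non-null, the kernel runs in
/// restricted mode, reading on-rank entries from X_domainmap and
/// off-rank entries from X_colmap.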
template <class SC, class LO, class GO, class NO>
void localResidual(const CrsMatrix<SC, LO, GO, NO>& A,
                   const MultiVector<SC, LO, GO, NO>& X_colmap,
                   const MultiVector<SC, LO, GO, NO>& B,
                   MultiVector<SC, LO, GO, NO>& R,
                   const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                   const MultiVector<SC, LO, GO, NO>* X_domainmap = nullptr) {
  using Teuchos::NO_TRANS;
  ProfilingRegion regionLocalApply("Tpetra::CrsMatrix::localResidual");

  using local_matrix_device_type = typename CrsMatrix<SC, LO, GO, NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl;

  const bool debug = ::Tpetra::Details::Behavior::debug();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getNumVectors() != R.getNumVectors(), std::runtime_error,
                               "X.getNumVectors() = " << X_colmap.getNumVectors() << " != "
                               "R.getNumVectors() = " << R.getNumVectors() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getLocalLength() !=
                               A.getColMap()->getLocalNumElements(),
                               std::runtime_error,
                               "X has the wrong number of local rows. "
                               "X.getLocalLength() = " << X_colmap.getLocalLength() << " != "
                               "A.getColMap()->getLocalNumElements() = "
                               << A.getColMap()->getLocalNumElements() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(R.getLocalLength() !=
                               A.getRowMap()->getLocalNumElements(),
                               std::runtime_error,
                               "R has the wrong number of local rows. "
                               "R.getLocalLength() = " << R.getLocalLength() << " != "
                               "A.getRowMap()->getLocalNumElements() = "
                               << A.getRowMap()->getLocalNumElements() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(B.getLocalLength() !=
                               A.getRowMap()->getLocalNumElements(),
                               std::runtime_error,
                               "B has the wrong number of local rows. "
                               "B.getLocalLength() = " << B.getLocalLength() << " != "
                               "A.getRowMap()->getLocalNumElements() = "
                               << A.getRowMap()->getLocalNumElements() << ".");

    TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
                               "The matrix A is not "
                               "fill complete. You must call fillComplete() (possibly with "
                               "domain and range Map arguments) without an intervening "
                               "resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION(!X_colmap.isConstantStride() || !R.isConstantStride() || !B.isConstantStride(),
                               std::runtime_error, "X, B, and R must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION((X_colmap_lcl.data() == R_lcl.data() && X_colmap_lcl.data() != nullptr) ||
                               (X_colmap_lcl.data() == B_lcl.data() && X_colmap_lcl.data() != nullptr),
                               std::runtime_error, "X, B, and R may not alias one another.");
  }

  const bool fusedResidual = ::Tpetra::Details::Behavior::fusedResidual();
  if (!fusedResidual) {
    SC one = Teuchos::ScalarTraits<SC>::one();
    SC negone = -one;
    SC zero = Teuchos::ScalarTraits<SC>::zero();
    // This is currently a "reference implementation" waiting until Kokkos Kernels provides
    // a residual kernel.
    A.localApply(X_colmap, R, Teuchos::NO_TRANS, one, zero);
    R.update(one, B, negone);
    return;
  }

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows();
  int64_t myNnz = A_lcl.nnz();

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC, LO, GO, NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

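  // Ask KokkosSparse's SpMV heuristics for launch parameters (rows per
  // team, team size, vector length), then cover all local rows with
  // ceil(numRows / rows_per_team) team "worksets".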
  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent(0) + rows_per_team - 1) / rows_per_team;

  policy_type policy(1, 1);
  if (team_size < 0) {
    policy = policy_type(worksets, Kokkos::AUTO, vector_length);
  } else {
    policy = policy_type(worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

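  // Select the functor instantiation whose compile-time flags (is_MV,
  // restrictedMode, skipOffRank) match the runtime situation, so the
  // kernel itself carries no runtime branches on these options.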
  if (is_vector) {
    if (X_domainmap == nullptr) {
      using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, false, false>;
      functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector", policy, func);

    } else {
      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, true, false>;
      functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector", policy, func);
    }
  } else {
    if (X_domainmap == nullptr) {
      using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, false, false>;
      functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector", policy, func);

    } else {
      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, true, false>;
      functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector", policy, func);
    }
  }
}

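/// \brief Computes R = B - A*X while overlapping communication with
/// computation.
///
/// The caller must already have started the Import of X_domainmap
/// into X_colmap (beginImport). This function first computes the
/// residual using only on-rank entries (skipOffRank = true), then
/// completes the Import (endImport), fences, and subtracts the
/// off-rank contributions with OffRankUpdateFunctor.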
template <class SC, class LO, class GO, class NO>
void localResidualWithCommCompOverlap(const CrsMatrix<SC, LO, GO, NO>& A,
                                      MultiVector<SC, LO, GO, NO>& X_colmap,
                                      const MultiVector<SC, LO, GO, NO>& B,
                                      MultiVector<SC, LO, GO, NO>& R,
                                      const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                                      const MultiVector<SC, LO, GO, NO>& X_domainmap) {
  using Teuchos::NO_TRANS;
  using Teuchos::RCP;
  using import_type = typename CrsMatrix<SC, LO, GO, NO>::import_type;

  ProfilingRegion regionLocalApply("Tpetra::CrsMatrix::localResidualWithCommCompOverlap");

  using local_matrix_device_type = typename CrsMatrix<SC, LO, GO, NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC, LO, GO, NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);

  const bool debug = ::Tpetra::Details::Behavior::debug();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getNumVectors() != R.getNumVectors(), std::runtime_error,
                               "X.getNumVectors() = " << X_colmap.getNumVectors() << " != "
                               "R.getNumVectors() = " << R.getNumVectors() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(X_colmap.getLocalLength() !=
                               A.getColMap()->getLocalNumElements(),
                               std::runtime_error,
                               "X has the wrong number of local rows. "
                               "X.getLocalLength() = " << X_colmap.getLocalLength() << " != "
                               "A.getColMap()->getLocalNumElements() = "
                               << A.getColMap()->getLocalNumElements() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(R.getLocalLength() !=
                               A.getRowMap()->getLocalNumElements(),
                               std::runtime_error,
                               "R has the wrong number of local rows. "
                               "R.getLocalLength() = " << R.getLocalLength() << " != "
                               "A.getRowMap()->getLocalNumElements() = "
                               << A.getRowMap()->getLocalNumElements() << ".");
    TEUCHOS_TEST_FOR_EXCEPTION(B.getLocalLength() !=
                               A.getRowMap()->getLocalNumElements(),
                               std::runtime_error,
                               "B has the wrong number of local rows. "
                               "B.getLocalLength() = " << B.getLocalLength() << " != "
                               "A.getRowMap()->getLocalNumElements() = "
                               << A.getRowMap()->getLocalNumElements() << ".");

    TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
                               "The matrix A is not "
                               "fill complete. You must call fillComplete() (possibly with "
                               "domain and range Map arguments) without an intervening "
                               "resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION(!X_colmap.isConstantStride() || !R.isConstantStride() || !B.isConstantStride(),
                               std::runtime_error, "X, B, and R must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION((X_colmap_lcl.data() == R_lcl.data() && X_colmap_lcl.data() != nullptr) ||
                               (X_colmap_lcl.data() == B_lcl.data() && X_colmap_lcl.data() != nullptr),
                               std::runtime_error, "X, B, and R may not alias one another.");
  }

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows();
  int64_t myNnz = A_lcl.nnz();

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC, LO, GO, NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

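  // Same SpMV launch heuristics as in localResidual above.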
  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent(0) + rows_per_team - 1) / rows_per_team;

  policy_type policy(1, 1);
  if (team_size < 0) {
    policy = policy_type(worksets, Kokkos::AUTO, vector_length);
  } else {
    policy = policy_type(worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

  if (is_vector) {
    using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false, true, true>;
    functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-vector", policy, func);

    RCP<const import_type> importer = A.getGraph()->getImporter();
    X_colmap.endImport(X_domainmap, *importer, INSERT, true);

    Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-1");

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, false>;
    functor_type2 func2(A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-vector-offrank", policy, func2);

  } else {
    using functor_type = LocalResidualFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true, true, true>;
    functor_type func(A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-multivector", policy, func);

    RCP<const import_type> importer = A.getGraph()->getImporter();
    X_colmap.endImport(X_domainmap, *importer, INSERT, true);

    Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-2");

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type, local_view_device_type, const_local_view_device_type, offset_type, true>;
    functor_type2 func2(A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-multivector-offrank", policy, func2);
  }
}

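/// \brief Computes R = B - A * X.
///
/// If Aop is a CrsMatrix, the residual may be computed with a fused
/// kernel, optionally in restricted mode (skipping the copy and
/// permute of the local part of X) and optionally overlapping the
/// halo exchange with computation, depending on the
/// Tpetra::Details::Behavior settings. For any other Operator it
/// falls back to apply() followed by update().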
template <class SC, class LO, class GO, class NO>
void residual(const Operator<SC, LO, GO, NO>& Aop,
              const MultiVector<SC, LO, GO, NO>& X_in,
              const MultiVector<SC, LO, GO, NO>& B_in,
              MultiVector<SC, LO, GO, NO>& R_in) {
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_const_cast;
  using Teuchos::rcpFromRef;

  const bool debug = ::Tpetra::Details::Behavior::debug();
  const bool skipCopyAndPermuteIfPossible = ::Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible();
  const bool overlapCommunicationAndComputation = ::Tpetra::Details::Behavior::overlapCommunicationAndComputation();
  if (overlapCommunicationAndComputation)
    TEUCHOS_ASSERT(skipCopyAndPermuteIfPossible);

  // Whether we are using restrictedMode in the import from domain to
  // column map. Restricted mode skips the copy and permutation of the
  // local part of X. We are using restrictedMode only when domain and
  // column map are locally fitted, i.e. when the local indices of
  // domain and column map match.
  bool restrictedMode = false;

  const CrsMatrix<SC, LO, GO, NO>* Apt = dynamic_cast<const CrsMatrix<SC, LO, GO, NO>*>(&Aop);
  if (!Apt) {
    // If we're not a CrsMatrix, we can't do fusion, so just do apply + update.
    SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
    Aop.apply(X_in, R_in, Teuchos::NO_TRANS, one, zero);
    R_in.update(one, B_in, negone);
    return;
  }
  const CrsMatrix<SC, LO, GO, NO>& A = *Apt;

  using import_type = typename CrsMatrix<SC, LO, GO, NO>::import_type;
  using export_type = typename CrsMatrix<SC, LO, GO, NO>::export_type;
  using MV = MultiVector<SC, LO, GO, NO>;
  using graph_type = typename CrsMatrix<SC, LO, GO, NO>::crs_graph_type;
  using offset_type = typename graph_type::offset_device_view_type;

  // We treat the case of a replicated MV output specially.
  const bool R_is_replicated =
      (!R_in.isDistributed() && A.getComm()->getSize() != 1);

  // It's possible that R is a view of X or B.
  // We don't try to detect the more subtle cases (e.g., one is a
  // subview of the other, but their initial pointers differ). We
  // only need to do this if this matrix's Import is trivial;
  // otherwise, we don't actually apply the operator from X into Y.

  RCP<const import_type> importer = A.getGraph()->getImporter();
  RCP<const export_type> exporter = A.getGraph()->getExporter();

  // Temporary MV for Import operation. After the block of code
  // below, this will be an (Imported if necessary) column Map MV
  // ready to give to localApply(...).
  RCP<MV> X_colMap;
  if (importer.is_null()) {
    if (!X_in.isConstantStride()) {
      // Not all sparse mat-vec kernels can handle an input MV with
      // nonconstant stride correctly, so we have to copy it in that
      // case into a constant stride MV. To make a constant stride
      // copy of X_in, we force creation of the column (== domain)
      // Map MV (if it hasn't already been created, else fetch the
      // cached copy). This avoids creating a new MV each time.
      X_colMap = A.getColumnMapMultiVector(X_in, true);
      Tpetra::deep_copy(*X_colMap, X_in);
      // X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
    } else {
      // The domain and column Maps are the same, so do the local
      // multiply using the domain Map input MV X_in.
      X_colMap = rcp_const_cast<MV>(rcpFromRef(X_in));
    }
  } else { // need to Import source (multi)vector
    ProfilingRegion regionImport("Tpetra::CrsMatrix::residual: Import");
    // We're doing an Import anyway, which will copy the relevant
    // elements of the domain Map MV X_in into a separate column Map
    // MV. Thus, we don't have to worry whether X_in is constant
    // stride.
    X_colMap = A.getColumnMapMultiVector(X_in);

    // Do we want to use restrictedMode?
    restrictedMode = skipCopyAndPermuteIfPossible && importer->isLocallyFitted();

    if (debug && restrictedMode) {
      TEUCHOS_TEST_FOR_EXCEPTION(!importer->getTargetMap()->isLocallyFitted(*importer->getSourceMap()), std::runtime_error,
                                 "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
    }

    // Import from the domain Map MV to the column Map MV.
    X_colMap->beginImport(X_in, *importer, INSERT, restrictedMode);
  }

  offset_type offsets;
  if (restrictedMode)
    A.getCrsGraph()->getLocalOffRankOffsets(offsets);

  // Get a vector for the R_rowMap output residual, handling the
  // non-constant stride and exporter cases. Since R gets clobbered,
  // we don't need to worry about the data in it.
  RCP<MV> R_rowMap;
  if (exporter.is_null()) {
    if (!R_in.isConstantStride()) {
      R_rowMap = A.getRowMapMultiVector(R_in);
    } else {
      R_rowMap = rcpFromRef(R_in);
    }
  } else {
    R_rowMap = A.getRowMapMultiVector(R_in);
  }

  // Get a vector for the B_rowMap right-hand side, handling the
  // non-constant stride and exporter cases.
  RCP<const MV> B_rowMap;
  if (exporter.is_null()) {
    if (!B_in.isConstantStride()) {
      // Do an allocation here. If we need to optimize this later, we can have the matrix
      // cache this.
      RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(), B_in.getNumVectors()));
      Tpetra::deep_copy(*B_rowMapNonConst, B_in);
      B_rowMap = rcp_const_cast<const MV>(B_rowMapNonConst);
    } else {
      B_rowMap = rcpFromRef(B_in);
    }
  } else {
    // Do an allocation here. If we need to optimize this later, we can have the matrix
    // cache this.
    ProfilingRegion regionExport("Tpetra::CrsMatrix::residual: B Import");
    RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(), B_in.getNumVectors()));
    B_rowMapNonConst->doImport(B_in, *exporter, ADD);
    B_rowMap = rcp_const_cast<const MV>(B_rowMapNonConst);
  }

  // If we have a nontrivial Export object, we must perform an
  // Export. In that case, the local multiply result will go into
  // the row Map multivector. We don't have to make a
  // constant-stride version of R_in in this case, because we had to
  // make a constant stride R_rowMap MV and do an Export anyway.
  if (!exporter.is_null()) {
    if (!importer.is_null())
      X_colMap->endImport(X_in, *importer, INSERT, restrictedMode);
    if (restrictedMode && !importer.is_null())
      localResidual(A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
    else
      localResidual(A, *X_colMap, *B_rowMap, *R_rowMap, offsets);

    {
      ProfilingRegion regionExport("Tpetra::CrsMatrix::residual: R Export");

      // Do the Export operation.
      R_in.doExport(*R_rowMap, *exporter, ADD);
    }
  } else { // Don't do an Export: row Map and range Map are the same.

    if (restrictedMode)
      if (overlapCommunicationAndComputation) {
        localResidualWithCommCompOverlap(A, *X_colMap, *B_rowMap, *R_rowMap, offsets, X_in);
      } else {
        X_colMap->endImport(X_in, *importer, INSERT, restrictedMode);
        localResidual(A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
      }
    else {
      if (!importer.is_null())
        X_colMap->endImport(X_in, *importer, INSERT, restrictedMode);
      localResidual(A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
    }

    //
    // If R_in does not have constant stride,
    // then we can't let the kernel write directly to R_in.
    // Instead, we have to use the cached row (== range)
    // Map MV as temporary storage.
    //
    if (!R_in.isConstantStride()) {
      // We need to be sure to do a copy out in this case.
      Tpetra::deep_copy(R_in, *R_rowMap);
    }
  }

  // If the range Map is a locally replicated Map, sum up
  // contributions from each process.
  if (R_is_replicated) {
    ProfilingRegion regionReduce("Tpetra::CrsMatrix::residual: Reduce Y");
    R_in.reduce();
  }
}

} // namespace Details
} // namespace Tpetra

#endif // TPETRA_DETAILS_RESIDUAL_HPP
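
Example usage (editor's sketch, not part of this header): the wrapper function below is hypothetical, and the empty template argument lists rely on Tpetra's default Scalar, LocalOrdinal, GlobalOrdinal, and Node types. It assumes A is fill complete and that X, B, and R are laid out on A's domain, range, and range Maps, respectively.

#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Tpetra_Details_residual.hpp"

// Hypothetical wrapper: computes R = B - A*X in one call. Equivalent to
//   A.apply(X, R);          // R = A*X
//   R.update(1.0, B, -1.0); // R = B - R
// but the fused path can save a kernel launch and overlap the halo
// exchange with computation, depending on Tpetra::Details::Behavior.
void computeResidualExample(const Tpetra::CrsMatrix<>& A,
                            const Tpetra::MultiVector<>& X,
                            const Tpetra::MultiVector<>& B,
                            Tpetra::MultiVector<>& R) {
  Tpetra::Details::residual(A, X, B, R);
}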