Ifpack2 Templated Preconditioning Package Version 1.0
Loading...
Searching...
No Matches
Ifpack2_Container_def.hpp
1// @HEADER
2// *****************************************************************************
3// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4//
5// Copyright 2009 NTESS and the Ifpack2 contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef IFPACK2_CONTAINER_DEF_HPP
11#define IFPACK2_CONTAINER_DEF_HPP
12
15#include <Teuchos_Time.hpp>
16
17namespace Ifpack2 {
18
19// Implementation of Ifpack2::Container
20
21template <class MatrixType>
23 const Teuchos::RCP<const row_matrix_type>& matrix,
24 const Teuchos::Array<Teuchos::Array<LO>>& partitions,
25 bool pointIndexed)
26 : inputMatrix_(matrix)
27 , inputCrsMatrix_(Teuchos::rcp_dynamic_cast<const crs_matrix_type>(inputMatrix_))
28 , inputBlockMatrix_(Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(inputMatrix_))
29 , pointIndexed_(pointIndexed)
30 , IsInitialized_(false)
31 , IsComputed_(false) {
32 using Teuchos::Array;
33 using Teuchos::ArrayView;
34 using Teuchos::Comm;
35 using Teuchos::Ptr;
36 using Teuchos::RCP;
37 NumLocalRows_ = inputMatrix_->getLocalNumRows();
38 NumGlobalRows_ = inputMatrix_->getGlobalNumRows();
39 NumGlobalNonzeros_ = inputMatrix_->getGlobalNumEntries();
40 IsParallel_ = inputMatrix_->getRowMap()->getComm()->getSize() != 1;
42 if (hasBlockCrs_)
43 bcrsBlockSize_ = inputBlockMatrix_->getBlockSize();
44 else
48 else
50 setBlockSizes(partitions);
51 // Sanity check the partitions
53 // Check whether the input set of local row indices is correct.
54 const map_type& rowMap = *inputMatrix_->getRowMap();
55 for (int i = 0; i < numBlocks_; i++) {
56 Teuchos::ArrayView<const LO> blockRows = getBlockRows(i);
57 for (LO j = 0; j < blockSizes_[i]; j++) {
58 LO row = blockRows[j];
59 if (pointIndexed) {
60 // convert the point row to the corresponding block row
61 row /= bcrsBlockSize_;
62 }
63 TEUCHOS_TEST_FOR_EXCEPTION(
64 !rowMap.isNodeLocalElement(row),
65 std::invalid_argument,
66 "Ifpack2::Container: "
67 "On process "
68 << rowMap.getComm()->getRank() << " of "
69 << rowMap.getComm()->getSize() << ", in the given set of local row "
70 "indices blockRows = "
71 << Teuchos::toString(blockRows) << ", the following "
72 "entries is not valid local row index on the calling process: "
73 << row << ".");
74 }
75 }
76 }
77}
78
79template <class MatrixType>
82
83template <class MatrixType>
84Teuchos::ArrayView<const typename MatrixType::local_ordinal_type>
86 return Teuchos::ArrayView<const LO>(&blockRows_[blockOffsets_[blockIndex]], blockSizes_[blockIndex]);
87}
88
89template <class MatrixType>
90void Container<MatrixType>::setBlockSizes(const Teuchos::Array<Teuchos::Array<LO>>& partitions) {
91 // First, create a grand total of all the rows in all the blocks
92 // Note: If partitioner allowed overlap, this could be greater than the # of local rows
93 LO totalBlockRows = 0;
94 numBlocks_ = partitions.size();
95 blockSizes_.resize(numBlocks_);
96 blockOffsets_.resize(numBlocks_);
97 maxBlockSize_ = 0;
98 for (int i = 0; i < numBlocks_; i++) {
99 LO rowsInBlock = partitions[i].size();
100 blockSizes_[i] = rowsInBlock;
101 blockOffsets_[i] = totalBlockRows;
102 totalBlockRows += rowsInBlock;
103 maxBlockSize_ = std::max(maxBlockSize_, rowsInBlock * scalarsPerRow_);
104 }
105 blockRows_.resize(totalBlockRows);
106 // set blockRows_: each entry is the partition/block of the row
107 LO iter = 0;
108 for (int i = 0; i < numBlocks_; i++) {
109 for (int j = 0; j < blockSizes_[i]; j++) {
110 blockRows_[iter++] = partitions[i][j];
111 }
112 }
113}
114
115template <class MatrixType>
117 if (Diag_.is_null()) {
118 Diag_ = rcp(new vector_type(inputMatrix_->getDomainMap()));
119 inputMatrix_->getLocalDiagCopy(*Diag_);
120 }
121}
122
123template <class MatrixType>
125 return IsInitialized_;
126}
127
128template <class MatrixType>
130 return IsComputed_;
131}
132
133template <class MatrixType>
135 applyMV(const mv_type& X, mv_type& Y) const {
136 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
137}
138
139template <class MatrixType>
141 weightedApplyMV(const mv_type& X, mv_type& Y, vector_type& W) const {
142 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
143}
144
145template <class MatrixType>
147 getName() {
148 return "Generic";
149}
150
151template <class MatrixType>
153 SC dampingFactor, LO i) const {
154 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
155}
156
157template <class MatrixType>
158void Container<MatrixType>::DoJacobi(ConstHostView X, HostView Y, SC dampingFactor) const {
159 using STS = Teuchos::ScalarTraits<ISC>;
160 const ISC one = STS::one();
161 // use blockRows_ and blockSizes_
162 size_t numVecs = X.extent(1);
163 // Non-overlapping Jacobi
164 for (LO i = 0; i < numBlocks_; i++) {
165 // may happen that a partition is empty
166 if (blockSizes_[i] != 1 || hasBlockCrs_) {
167 if (blockSizes_[i] == 0)
168 continue;
169 apply(X, Y, i, Teuchos::NO_TRANS, dampingFactor, one);
170 } else // singleton, can't access Containers_[i] as it was never filled and may be null.
171 {
172 LO LRID = blockRows_[blockOffsets_[i]];
173 getMatDiag();
174 auto diagView = Diag_->getLocalViewHost(Tpetra::Access::ReadOnly);
175 ISC d = one * (static_cast<ISC>(dampingFactor)) / diagView(LRID, 0);
176 for (size_t nv = 0; nv < numVecs; nv++) {
177 ISC x = X(LRID, nv);
178 Y(LRID, nv) += x * d;
179 }
180 }
181 }
182}
183
184template <class MatrixType>
185void Container<MatrixType>::DoOverlappingJacobi(ConstHostView X, HostView Y, ConstHostView W, SC dampingFactor, bool nonsymScaling) const {
186 using STS = Teuchos::ScalarTraits<SC>;
187 // Overlapping Jacobi
188 for (LO i = 0; i < numBlocks_; i++) {
189 // may happen that a partition is empty
190 if (blockSizes_[i] == 0)
191 continue;
192 if (blockSizes_[i] != 1) {
193 if (!nonsymScaling)
194 weightedApply(X, Y, W, i, Teuchos::NO_TRANS, dampingFactor, STS::one());
195 else {
196 // A crummy way of doing nonsymmetric scaling. We effectively
197 // first reverse scale x, which later gets scaled inside weightedApply
198 // so the net effect is that x is not scaled.
199 // This was done to keep using weightedApply() that is defined in
200 // many spots in the code.
201 HostView tempo("", X.extent(0), X.extent(1));
202 size_t numVecs = X.extent(1);
203 LO bOffset = blockOffsets_[i];
204 for (LO ii = 0; ii < blockSizes_[i]; ii++) {
205 LO LRID = blockRows_[bOffset++];
206 for (size_t jj = 0; jj < numVecs; jj++) tempo(LRID, jj) = X(LRID, jj) / W(LRID, 0);
207 }
208 weightedApply(tempo, Y, W, i, Teuchos::NO_TRANS, dampingFactor, STS::one());
209 }
210 } else // singleton, can't access Containers_[i] as it was never filled and may be null.
211 {
212 const ISC one = STS::one();
213 size_t numVecs = X.extent(1);
214 LO LRID = blockRows_[blockOffsets_[i]];
215 getMatDiag();
216 auto diagView = Diag_->getLocalViewHost(Tpetra::Access::ReadOnly);
217 ISC recip = one / diagView(LRID, 0);
218 ISC wval = W(LRID, 0);
219 ISC combo = wval * recip;
220 ISC d = combo * (static_cast<ISC>(dampingFactor));
221 for (size_t nv = 0; nv < numVecs; nv++) {
222 ISC x = X(LRID, nv);
223 Y(LRID, nv) = x * d + Y(LRID, nv);
224 }
225 }
226 }
227}
228
229// Do Gauss-Seidel with just block i
230// This is used 3 times: once in DoGaussSeidel and twice in DoSGS
231template <class MatrixType, typename LocalScalarType>
233 ConstHostView X, HostView Y, HostView Y2, HostView Resid,
234 SC dampingFactor, LO i) const {
235 using Teuchos::ArrayView;
236 using STS = Teuchos::ScalarTraits<ISC>;
237 size_t numVecs = X.extent(1);
238 const ISC one = STS::one();
239 if (this->blockSizes_[i] == 0)
240 return; // Skip empty partitions
241 if (this->hasBlockCrs_ && !this->pointIndexed_) {
242 // Use efficient blocked version
243 ArrayView<const LO> blockRows = this->getBlockRows(i);
244 const size_t localNumRows = this->blockSizes_[i];
245 using inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
246 using vals_type = typename block_crs_matrix_type::values_host_view_type;
247 for (size_t j = 0; j < localNumRows; j++) {
248 LO row = blockRows[j]; // Containers_[i]->ID (j);
249 vals_type values;
250 inds_type colinds;
251 this->inputBlockMatrix_->getLocalRowView(row, colinds, values);
252 LO numEntries = (LO)colinds.size();
253 for (size_t m = 0; m < numVecs; m++) {
254 for (int localR = 0; localR < this->bcrsBlockSize_; localR++)
255 Resid(row * this->bcrsBlockSize_ + localR, m) = X(row * this->bcrsBlockSize_ + localR, m);
256 for (LO k = 0; k < numEntries; ++k) {
257 const LO col = colinds[k];
258 for (int localR = 0; localR < this->bcrsBlockSize_; localR++) {
259 for (int localC = 0; localC < this->bcrsBlockSize_; localC++) {
260 Resid(row * this->bcrsBlockSize_ + localR, m) -=
261 values[k * this->bcrsBlockSize_ * this->bcrsBlockSize_ + localR + localC * this->bcrsBlockSize_] * Y2(col * this->bcrsBlockSize_ + localC, m);
262 }
263 }
264 }
265 }
266 }
267 // solve with this block
268 //
269 // Note: I'm abusing the ordering information, knowing that X/Y
270 // and Y2 have the same ordering for on-proc unknowns.
271 //
272 // Note: Add flop counts for inverse apply
273 apply(Resid, Y2, i, Teuchos::NO_TRANS, dampingFactor, one);
274 } else if (!this->hasBlockCrs_ && this->blockSizes_[i] == 1) {
275 // singleton, can't access Containers_[i] as it was never filled and may be null.
276 // a singleton calculation (just using matrix diagonal) is exact, all residuals should be zero.
277 LO LRID = this->blockOffsets_[i]; // by definition, a singleton 1 row in block.
278 // Use the KokkosSparse internal matrix for low-overhead values/indices access
279 // But, can only do this if the matrix is accessible directly from host, since it's not a DualView
281 container_exec_space().fence();
282 auto localA = this->inputCrsMatrix_->getLocalMatrixHost();
283 using size_type = typename crs_matrix_type::local_matrix_host_type::size_type;
284 const auto& rowmap = localA.graph.row_map;
285 const auto& entries = localA.graph.entries;
286 const auto& values = localA.values;
287 this->getMatDiag();
288 auto diagView = this->Diag_->getLocalViewHost(Tpetra::Access::ReadOnly);
289 ISC d = (static_cast<ISC>(dampingFactor)) / diagView(LRID, 0);
290 for (size_t m = 0; m < numVecs; m++) {
291 // ISC x = X(LRID, m);
292 ISC r = X(LRID, m);
293 for (size_type k = rowmap(LRID); k < rowmap(LRID + 1); k++) {
294 const LO col = entries(k);
295 r -= values(k) * Y2(col, m);
296 }
297
298 ISC newy = r * d;
299 Y2(LRID, m) += newy;
300 }
301 } else if (!this->inputCrsMatrix_.is_null() &&
302 std::is_same<typename crs_matrix_type::device_type::memory_space, Kokkos::HostSpace>::value) {
303 // Use the KokkosSparse internal matrix for low-overhead values/indices access
304 // But, can only do this if the matrix is accessible directly from host, since it's not a DualView
306 container_exec_space().fence();
307 auto localA = this->inputCrsMatrix_->getLocalMatrixHost();
308 using size_type = typename crs_matrix_type::local_matrix_host_type::size_type;
309 const auto& rowmap = localA.graph.row_map;
310 const auto& entries = localA.graph.entries;
311 const auto& values = localA.values;
312 ArrayView<const LO> blockRows = this->getBlockRows(i);
313 for (size_t j = 0; j < size_t(blockRows.size()); j++) {
314 const LO row = blockRows[j];
315 for (size_t m = 0; m < numVecs; m++) {
316 ISC r = X(row, m);
317 for (size_type k = rowmap(row); k < rowmap(row + 1); k++) {
318 const LO col = entries(k);
319 r -= values(k) * Y2(col, m);
320 }
321 Resid(row, m) = r;
322 }
323 }
324 // solve with this block
325 //
326 // Note: I'm abusing the ordering information, knowing that X/Y
327 // and Y2 have the same ordering for on-proc unknowns.
328 //
329 // Note: Add flop counts for inverse apply
330 apply(Resid, Y2, i, Teuchos::NO_TRANS, dampingFactor, one);
331 } else {
332 // Either a point-indexed block matrix, or a normal row matrix
333 // that doesn't support getLocalMatrix
334 ArrayView<const LO> blockRows = this->getBlockRows(i);
335 for (size_t j = 0; j < size_t(blockRows.size()); j++) {
336 const LO row = blockRows[j];
337 auto rowView = getInputRowView(row);
338 for (size_t m = 0; m < numVecs; m++) {
339 Resid(row, m) = X(row, m);
340 for (size_t k = 0; k < rowView.size(); ++k) {
341 const LO col = rowView.ind(k);
342 Resid(row, m) -= rowView.val(k) * Y2(col, m);
343 }
344 }
345 }
346 // solve with this block
347 //
348 // Note: I'm abusing the ordering information, knowing that X/Y
349 // and Y2 have the same ordering for on-proc unknowns.
350 //
351 // Note: Add flop counts for inverse apply
352 apply(Resid, Y2, i, Teuchos::NO_TRANS, dampingFactor, one);
353 }
354}
355
356template <class MatrixType>
357void Container<MatrixType>::
358 DoGaussSeidel(ConstHostView X, HostView Y, HostView Y2, SC dampingFactor) const {
359 using Teuchos::Array;
360 using Teuchos::ArrayRCP;
361 using Teuchos::ArrayView;
362 using Teuchos::Ptr;
363 using Teuchos::RCP;
364 using Teuchos::rcp;
365 using Teuchos::rcpFromRef;
366 // This function just extracts the diagonal if it hasn't already.
367 getMatDiag();
368 auto numVecs = X.extent(1);
369 // X = RHS, Y = initial guess
370 HostView Resid("", X.extent(0), X.extent(1));
371 for (LO i = 0; i < numBlocks_; i++) {
372 DoGSBlock(X, Y, Y2, Resid, dampingFactor, i);
373 }
374 if (IsParallel_) {
375 auto numMyRows = inputMatrix_->getLocalNumRows();
376 for (size_t m = 0; m < numVecs; ++m) {
377 for (size_t i = 0; i < numMyRows * bcrsBlockSize_; ++i) {
378 Y(i, m) = Y2(i, m);
379 }
380 }
381 }
382}
383
384template <class MatrixType>
385void Container<MatrixType>::
386 DoSGS(ConstHostView X, HostView Y, HostView Y2, SC dampingFactor) const {
387 // X = RHS, Y = initial guess
388 using Teuchos::Array;
389 using Teuchos::ArrayRCP;
390 using Teuchos::ArrayView;
391 using Teuchos::Ptr;
392 using Teuchos::ptr;
393 using Teuchos::RCP;
394 using Teuchos::rcp;
395 using Teuchos::rcpFromRef;
396 auto numVecs = X.extent(1);
397 HostView Resid("", X.extent(0), X.extent(1));
398 // Forward Sweep
399 for (LO i = 0; i < numBlocks_; i++) {
400 DoGSBlock(X, Y, Y2, Resid, dampingFactor, i);
401 }
402 static_assert(std::is_signed<LO>::value,
403 "Local ordinal must be signed (unsigned breaks reverse iteration to 0)");
404 // Reverse Sweep
405 for (LO i = numBlocks_ - 1; i >= 0; --i) {
406 DoGSBlock(X, Y, Y2, Resid, dampingFactor, i);
407 }
408 if (IsParallel_) {
409 auto numMyRows = inputMatrix_->getLocalNumRows();
410 for (size_t m = 0; m < numVecs; ++m) {
411 for (size_t i = 0; i < numMyRows * bcrsBlockSize_; ++i) {
412 Y(i, m) = Y2(i, m);
413 }
414 }
415 }
416}
417
418template <class MatrixType>
419void Container<MatrixType>::
420 clearBlocks() {
421 numBlocks_ = 0;
422 blockRows_.clear();
423 blockSizes_.clear();
424 blockOffsets_.clear();
425 Diag_ = Teuchos::null; // Diag_ will be recreated if needed
426}
427
428// Implementation of Ifpack2::ContainerImpl
429
430template <class MatrixType, class LocalScalarType>
433 const Teuchos::RCP<const row_matrix_type>& matrix,
434 const Teuchos::Array<Teuchos::Array<LO>>& partitions,
435 bool pointIndexed)
436 : Container<MatrixType>(matrix, partitions, pointIndexed) {}
438template <class MatrixType, class LocalScalarType>
441
442template <class MatrixType, class LocalScalarType>
444 setParameters(const Teuchos::ParameterList& List) {}
445
446template <class MatrixType, class LocalScalarType>
448 applyInverseJacobi(const mv_type& /* X */, mv_type& /* Y */,
449 SC dampingFactor,
450 bool /* zeroStartingSolution = false */,
451 int /* numSweeps = 1 */) const {
452 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
453}
454
455template <class MatrixType, class LocalScalarType>
457 applyMV(const mv_type& X, mv_type& Y) const {
458 ConstHostView XView = X.getLocalViewHost(Tpetra::Access::ReadOnly);
459 HostView YView = Y.getLocalViewHost(Tpetra::Access::ReadWrite);
460 this->apply(XView, YView, 0);
462
463template <class MatrixType, class LocalScalarType>
465 weightedApplyMV(const mv_type& X,
466 mv_type& Y,
467 vector_type& W) const {
468 ConstHostView XView = X.getLocalViewHost(Tpetra::Access::ReadOnly);
469 HostView YView = Y.getLocalViewHost(Tpetra::Access::ReadWrite);
470 ConstHostView WView = W.getLocalViewHost(Tpetra::Access::ReadOnly);
471 weightedApply(XView, YView, WView, 0);
472}
473
474template <class MatrixType, class LocalScalarType>
476 getName() {
477 return "Generic";
478}
479
480template <class MatrixType, class LocalScalarType>
482 solveBlock(ConstHostSubviewLocal X,
483 HostSubviewLocal Y,
484 int blockIndex,
485 Teuchos::ETransp mode,
486 const LSC alpha,
487 const LSC beta) const {
488 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
489}
490
491template <class MatrixType, class LocalScalarType>
492typename ContainerImpl<MatrixType, LocalScalarType>::LO
494 translateRowToCol(LO row) {
495 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
496 GO GO_INVALID = Teuchos::OrdinalTraits<GO>::invalid();
497 const map_type& globalRowMap = *(this->inputMatrix_->getRowMap());
498 const map_type& globalColMap = *(this->inputMatrix_->getColMap());
499 LO rowLID = row;
500 LO dofOffset = 0;
501 if (this->pointIndexed_) {
502 rowLID = row / this->bcrsBlockSize_;
503 dofOffset = row % this->bcrsBlockSize_;
504 }
505 GO diagGID = globalRowMap.getGlobalElement(rowLID);
506 TEUCHOS_TEST_FOR_EXCEPTION(
507 diagGID == GO_INVALID,
508 std::runtime_error,
509 "Ifpack2::Container::translateRowToCol: "
510 "On process "
511 << this->inputMatrix_->getRowMap()->getComm()->getRank() << ", at least one row index in the set of local "
512 "row indices given to the constructor is not a valid local row index in "
513 "the input matrix's row Map on this process. This should be impossible "
514 "because the constructor checks for this case. Here is the complete set "
515 "of invalid local row indices: "
516 << rowLID << ". "
517 "Please report this bug to the Ifpack2 developers.");
518 // now, can translate diagGID (both a global row AND global col ID) to local column
519 LO colLID = globalColMap.getLocalElement(diagGID);
520 TEUCHOS_TEST_FOR_EXCEPTION(
521 colLID == LO_INVALID,
522 std::runtime_error,
523 "Ifpack2::Container::translateRowToCol: "
524 "On process "
525 << this->inputMatrix_->getRowMap()->getComm()->getRank() << ", "
526 "at least one row index in the set of row indices given to the constructor "
527 "does not have a corresponding column index in the input matrix's column "
528 "Map. This probably means that the column(s) in question is/are empty on "
529 "this process, which would make the submatrix to extract structurally "
530 "singular. The invalid global column index is "
531 << diagGID << ".");
532 // colLID could identify a block column - translate to split column if needed
533 if (this->pointIndexed_)
534 return colLID * this->bcrsBlockSize_ + dofOffset;
535 return colLID;
536}
537
538template <class MatrixType, class LocalScalarType>
540 apply(ConstHostView X,
541 HostView Y,
542 int blockIndex,
543 Teuchos::ETransp mode,
544 SC alpha,
545 SC beta) const {
546 using Teuchos::ArrayView;
547 using Teuchos::as;
548 using Teuchos::RCP;
549 using Teuchos::rcp;
550
551 // The local operator might have a different Scalar type than
552 // MatrixType. This means that we might have to convert X and Y to
553 // the Tpetra::MultiVector specialization that the local operator
554 // wants. This class' X_ and Y_ internal fields are of the right
555 // type for the local operator, so we can use those as targets.
556
558
559 TEUCHOS_TEST_FOR_EXCEPTION(
560 !this->IsComputed_, std::runtime_error,
561 "Ifpack2::Container::apply: "
562 "You must have called the compute() method before you may call apply(). "
563 "You may call the apply() method as many times as you want after calling "
564 "compute() once, but you must have called compute() at least once.");
565
566 const size_t numVecs = X.extent(1);
567
568 if (numVecs == 0) {
569 return; // done! nothing to do
570 }
571
572 // The local operator works on a permuted subset of the local parts
573 // of X and Y. The subset and permutation are defined by the index
574 // array returned by getBlockRows(). If the permutation is trivial
575 // and the subset is exactly equal to the local indices, then we
576 // could use the local parts of X and Y exactly, without needing to
577 // permute. Otherwise, we have to use temporary storage to permute
578 // X and Y. For now, we always use temporary storage.
579 //
580 // Create temporary permuted versions of the input and output.
581 // (Re)allocate X_ and/or Y_ only if necessary. We'll use them to
582 // store the permuted versions of X resp. Y. Note that X_local has
583 // the domain Map of the operator, which may be a permuted subset of
584 // the local Map corresponding to X.getMap(). Similarly, Y_local
585 // has the range Map of the operator, which may be a permuted subset
586 // of the local Map corresponding to Y.getMap(). numRows_ here
587 // gives the number of rows in the row Map of the local Inverse_
588 // operator.
589 //
590 // FIXME (mfh 20 Aug 2013) There might be an implicit assumption
591 // here that the row Map and the range Map of that operator are
592 // the same.
593 //
594 // FIXME (mfh 20 Aug 2013) This "local permutation" functionality
595 // really belongs in Tpetra.
596
597 if (X_localBlocks_.size() == 0 || X.extent(1) != X_local_.extent(1)) {
598 // need to resize (or create for the first time) the three scratch arrays
599 X_localBlocks_.clear();
600 Y_localBlocks_.clear();
601 X_localBlocks_.reserve(this->numBlocks_);
602 Y_localBlocks_.reserve(this->numBlocks_);
603
604 X_local_ = HostViewLocal("X_local", this->blockRows_.size() * this->scalarsPerRow_, numVecs);
605 Y_local_ = HostViewLocal("Y_local", this->blockRows_.size() * this->scalarsPerRow_, numVecs);
606
607 // create all X_local and Y_local managed Views at once, are
608 // reused in subsequent apply() calls
609 for (int i = 0; i < this->numBlocks_; i++) {
610 auto blockBounds = std::make_pair(this->blockOffsets_[i] * this->scalarsPerRow_,
611 (this->blockOffsets_[i] + this->blockSizes_[i]) * this->scalarsPerRow_);
612 X_localBlocks_.emplace_back(X_local_, blockBounds, Kokkos::ALL());
613 Y_localBlocks_.emplace_back(Y_local_, blockBounds, Kokkos::ALL());
614 }
615 }
616
617 const ArrayView<const LO> blockRows = this->getBlockRows(blockIndex);
618
619 if (this->scalarsPerRow_ == 1)
620 mvgs.gatherViewToView(X_localBlocks_[blockIndex], X, blockRows);
621 else
622 mvgs.gatherViewToViewBlock(X_localBlocks_[blockIndex], X, blockRows, this->scalarsPerRow_);
623
624 // We must gather the contents of the output multivector Y even on
625 // input to solveBlock(), since the inverse operator might use it as
626 // an initial guess for a linear solve. We have no way of knowing
627 // whether it does or does not.
628
629 if (this->scalarsPerRow_ == 1)
630 mvgs.gatherViewToView(Y_localBlocks_[blockIndex], Y, blockRows);
631 else
632 mvgs.gatherViewToViewBlock(Y_localBlocks_[blockIndex], Y, blockRows, this->scalarsPerRow_);
633
634 // Apply the local operator:
635 // Y_local := beta*Y_local + alpha*M^{-1}*X_local
636 this->solveBlock(X_localBlocks_[blockIndex], Y_localBlocks_[blockIndex], blockIndex, mode,
637 LSC(alpha), LSC(beta));
638
639 // Scatter the permuted subset output vector Y_local back into the
640 // original output multivector Y.
641 if (this->scalarsPerRow_ == 1)
642 mvgs.scatterViewToView(Y, Y_localBlocks_[blockIndex], blockRows);
643 else
644 mvgs.scatterViewToViewBlock(Y, Y_localBlocks_[blockIndex], blockRows, this->scalarsPerRow_);
645}
646
647template <class MatrixType, class LocalScalarType>
649 weightedApply(ConstHostView X,
650 HostView Y,
651 ConstHostView D,
652 int blockIndex,
653 Teuchos::ETransp mode,
654 SC alpha,
655 SC beta) const {
656 using std::endl;
657 using Teuchos::ArrayRCP;
658 using Teuchos::ArrayView;
659 using Teuchos::Ptr;
660 using Teuchos::ptr;
661 using Teuchos::Range1D;
662 using Teuchos::RCP;
663 using Teuchos::rcp;
664 using Teuchos::rcp_const_cast;
665 using STS = Teuchos::ScalarTraits<SC>;
666
667 // The local operator template parameter might have a different
668 // Scalar type than MatrixType. This means that we might have to
669 // convert X and Y to the Tpetra::MultiVector specialization that
670 // the local operator wants. This class' X_ and Y_ internal fields
671 // are of the right type for the local operator, so we can use those
672 // as targets.
673
674 const char prefix[] = "Ifpack2::Container::weightedApply: ";
675 TEUCHOS_TEST_FOR_EXCEPTION(
676 !this->IsComputed_, std::runtime_error, prefix << "You must have called the "
677 "compute() method before you may call this method. You may call "
678 "weightedApply() as many times as you want after calling compute() once, "
679 "but you must have called compute() at least once first.");
680
681 // bmk 7-2019: BlockRelaxation already checked this, but if that changes...
682 TEUCHOS_TEST_FOR_EXCEPTION(
683 this->scalarsPerRow_ > 1, std::logic_error, prefix << "Use of block rows isn't allowed "
684 "in overlapping Jacobi (the only method that uses weightedApply");
685
686 const size_t numVecs = X.extent(1);
687
688 TEUCHOS_TEST_FOR_EXCEPTION(
689 X.extent(1) != Y.extent(1), std::runtime_error,
690 prefix << "X and Y have different numbers of vectors (columns). X has "
691 << X.extent(1) << ", but Y has " << Y.extent(1) << ".");
692
693 if (numVecs == 0) {
694 return; // done! nothing to do
695 }
696
697 const size_t numRows = this->blockSizes_[blockIndex];
698
699 // The local operator works on a permuted subset of the local parts
700 // of X and Y. The subset and permutation are defined by the index
701 // array returned by getBlockRows(). If the permutation is trivial
702 // and the subset is exactly equal to the local indices, then we
703 // could use the local parts of X and Y exactly, without needing to
704 // permute. Otherwise, we have to use temporary storage to permute
705 // X and Y. For now, we always use temporary storage.
706 //
707 // Create temporary permuted versions of the input and output.
708 // (Re)allocate X_ and/or Y_ only if necessary. We'll use them to
709 // store the permuted versions of X resp. Y. Note that X_local has
710 // the domain Map of the operator, which may be a permuted subset of
711 // the local Map corresponding to X.getMap(). Similarly, Y_local
712 // has the range Map of the operator, which may be a permuted subset
713 // of the local Map corresponding to Y.getMap(). numRows_ here
714 // gives the number of rows in the row Map of the local operator.
715 //
716 // FIXME (mfh 20 Aug 2013) There might be an implicit assumption
717 // here that the row Map and the range Map of that operator are
718 // the same.
719 //
720 // FIXME (mfh 20 Aug 2013) This "local permutation" functionality
721 // really belongs in Tpetra.
722 if (X_localBlocks_.size() == 0 || X.extent(1) != X_local_.extent(1)) {
723 // need to resize (or create for the first time) the three scratch arrays
724 X_localBlocks_.clear();
725 Y_localBlocks_.clear();
726 X_localBlocks_.reserve(this->numBlocks_);
727 Y_localBlocks_.reserve(this->numBlocks_);
728
729 X_local_ = HostViewLocal("X_local", this->blockRows_.size() * this->scalarsPerRow_, numVecs);
730 Y_local_ = HostViewLocal("Y_local", this->blockRows_.size() * this->scalarsPerRow_, numVecs);
731
732 // create all X_local and Y_local managed Views at once, are
733 // reused in subsequent apply() calls
734 for (int i = 0; i < this->numBlocks_; i++) {
735 auto blockBounds = std::make_pair(this->blockOffsets_[i] * this->scalarsPerRow_,
736 (this->blockOffsets_[i] + this->blockSizes_[i]) * this->scalarsPerRow_);
737 X_localBlocks_.emplace_back(X_local_, blockBounds, Kokkos::ALL());
738 Y_localBlocks_.emplace_back(Y_local_, blockBounds, Kokkos::ALL());
739 }
740 }
741 if (int(weightedApplyScratch_.extent(0)) != 3 * this->maxBlockSize_ ||
742 weightedApplyScratch_.extent(1) != numVecs) {
743 weightedApplyScratch_ = HostViewLocal("weightedApply scratch", 3 * this->maxBlockSize_, numVecs);
744 }
745
746 ArrayView<const LO> blockRows = this->getBlockRows(blockIndex);
747
749
750 // note: BlockCrs w/ weighted Jacobi isn't allowed, so no need to use block gather/scatter
751 mvgs.gatherViewToView(X_localBlocks_[blockIndex], X, blockRows);
752 // We must gather the output multivector Y even on input to
753 // solveBlock(), since the local operator might use it as an initial
754 // guess for a linear solve. We have no way of knowing whether it
755 // does or does not.
756
757 mvgs.gatherViewToView(Y_localBlocks_[blockIndex], Y, blockRows);
758
759 // Apply the diagonal scaling D to the input X. It's our choice
760 // whether the result has the original input Map of X, or the
761 // permuted subset Map of X_local. If the latter, we also need to
762 // gather D into the permuted subset Map. We choose the latter, to
763 // save memory and computation. Thus, we do the following:
764 //
765 // 1. Gather D into a temporary vector D_local.
766 // 2. Create a temporary X_scaled to hold diag(D_local) * X_local.
767 // 3. Compute X_scaled := diag(D_loca) * X_local.
768 auto maxBS = this->maxBlockSize_;
769 auto bs = this->blockSizes_[blockIndex] * this->scalarsPerRow_;
770
771 HostSubviewLocal D_local(weightedApplyScratch_, std::make_pair(0, bs), std::make_pair(0, 1));
772 mvgs.gatherViewToView(D_local, D, blockRows);
773 HostSubviewLocal X_scaled(weightedApplyScratch_, std::make_pair(maxBS, maxBS + bs), Kokkos::ALL());
774 for (size_t j = 0; j < numVecs; j++)
775 for (size_t i = 0; i < numRows; i++)
776 X_scaled(i, j) = X_localBlocks_[blockIndex](i, j) * D_local(i, 0);
777
778 HostSubviewLocal Y_temp(weightedApplyScratch_, std::make_pair(maxBS * 2, maxBS * 2 + bs), Kokkos::ALL());
779 // Apply the local operator: Y_temp := M^{-1} * X_scaled
780 this->solveBlock(X_scaled, Y_temp, blockIndex, mode, STS::one(), STS::zero());
781 // Y_local := beta * Y_local + alpha * diag(D_local) * Y_temp.
782 //
783 // Note that we still use the permuted subset scaling D_local here,
784 // because Y_temp has the same permuted subset Map. That's good, in
785 // fact, because it's a subset: less data to read and multiply.
786 LISC a = alpha;
787 LISC b = beta;
788 for (size_t j = 0; j < numVecs; j++)
789 for (size_t i = 0; i < numRows; i++)
790 Y_localBlocks_[blockIndex](i, j) = b * Y_localBlocks_[blockIndex](i, j) + a * Y_temp(i, j) * D_local(i, 0);
791
792 // Copy the permuted subset output vector Y_local into the original
793 // output multivector Y.
794 mvgs.scatterViewToView(Y, Y_localBlocks_[blockIndex], blockRows);
795}
796
797template <class MatrixType, class LocalScalarType>
799 typename ContainerImpl<MatrixType, LocalScalarType>::SC,
800 typename ContainerImpl<MatrixType, LocalScalarType>::LO,
801 typename ContainerImpl<MatrixType, LocalScalarType>::GO,
802 typename ContainerImpl<MatrixType, LocalScalarType>::NO>
804 getInputRowView(LO row) const {
805 typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
806 typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type;
807
808 typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type;
809 typedef typename MatrixType::values_host_view_type values_host_view_type;
810 using IST = typename row_matrix_type::impl_scalar_type;
811
812 if (this->hasBlockCrs_) {
813 using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
814 using h_vals_type = typename block_crs_matrix_type::values_host_view_type;
815 h_inds_type colinds;
816 h_vals_type values;
817 this->inputBlockMatrix_->getLocalRowView(row / this->bcrsBlockSize_, colinds, values);
818 size_t numEntries = colinds.size();
819 // CMS: Can't say I understand what this really does
820 // return StridedRowView(values + row % this->bcrsBlockSize_, colinds, this->bcrsBlockSize_, numEntries * this->bcrsBlockSize_);
821 h_vals_type subvals = Kokkos::subview(values, std::pair<size_t, size_t>(row % this->bcrsBlockSize_, values.size()));
822 return StridedRowView(subvals, colinds, this->bcrsBlockSize_, numEntries * this->bcrsBlockSize_);
823 } else if (!this->inputMatrix_->supportsRowViews()) {
824 size_t maxEntries = this->inputMatrix_->getLocalMaxNumRowEntries();
825 Teuchos::Array<LO> inds(maxEntries);
826 Teuchos::Array<SC> vals(maxEntries);
827 nonconst_local_inds_host_view_type inds_v(inds.data(), maxEntries);
828 nonconst_values_host_view_type vals_v(reinterpret_cast<IST*>(vals.data()), maxEntries);
829 size_t numEntries;
830 this->inputMatrix_->getLocalRowCopy(row, inds_v, vals_v, numEntries);
831 vals.resize(numEntries);
832 inds.resize(numEntries);
833 return StridedRowView(vals, inds);
834 } else {
835 // CMS - This is dangerous and might not work.
836 local_inds_host_view_type colinds;
837 values_host_view_type values;
838 this->inputMatrix_->getLocalRowView(row, colinds, values);
839 return StridedRowView(values, colinds, 1, colinds.size());
840 }
841}
842
843template <class MatrixType, class LocalScalarType>
845 clearBlocks() {
846 X_localBlocks_.clear();
847 Y_localBlocks_.clear();
848 X_local_ = HostViewLocal();
849 Y_local_ = HostViewLocal();
851}
852
853namespace Details {
854
855// Implementation of Ifpack2::Details::StridedRowView
856template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
858 StridedRowView(h_vals_type vals_, h_inds_type inds_, int blockSize_, size_t nnz_)
859 : vals(vals_)
860 , inds(inds_)
861 , blockSize(blockSize_)
862 , nnz(nnz_) {}
863
864template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
866 StridedRowView(Teuchos::Array<SC>& vals_, Teuchos::Array<LO>& inds_)
867 : vals()
868 , inds()
869 , blockSize(1)
870 , nnz(vals_.size()) {
871 valsCopy.swap(vals_);
872 indsCopy.swap(inds_);
873}
874
875template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
877 val(size_t i) const {
879 TEUCHOS_TEST_FOR_EXCEPTION(i >= nnz, std::runtime_error,
880 "Out-of-bounds access into Ifpack2::Container::StridedRowView");
881 }
882 if (vals.size() > 0) {
883 if (blockSize == 1)
884 return vals[i];
885 else
886 return vals[i * blockSize];
887 } else
888 return valsCopy[i];
889}
890
891template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
892LocalOrdinal StridedRowView<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
893 ind(size_t i) const {
895 TEUCHOS_TEST_FOR_EXCEPTION(i >= nnz, std::runtime_error,
896 "Out-of-bounds access into Ifpack2::Container::StridedRowView");
897 }
898 // inds is smaller than vals by a factor of the block size (dofs/node)
899 if (inds.size() > 0) {
900 if (blockSize == 1)
901 return inds[i];
902 else
903 return inds[i / blockSize] * blockSize + i % blockSize;
904 } else
905 return indsCopy[i];
906}
907
908template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
909size_t StridedRowView<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
910 size() const {
911 return nnz;
912}
913} // namespace Details
914
915} // namespace Ifpack2
916
917template <class MatrixType>
918std::ostream& operator<<(std::ostream& os, const Ifpack2::Container<MatrixType>& obj) {
919 return obj.print(os);
920}
921
922#define IFPACK2_CONTAINER_INSTANT(S, LO, GO, N) \
923 template class Ifpack2::Container<Tpetra::RowMatrix<S, LO, GO, N>>; \
924 template class Ifpack2::ContainerImpl<Tpetra::RowMatrix<S, LO, GO, N>, S>; \
925 template class Ifpack2::Details::StridedRowView<S, LO, GO, N>; \
926 template std::ostream& operator<< <Tpetra::RowMatrix<S, LO, GO, N>>( \
927 std::ostream& os, const Ifpack2::Container<Tpetra::RowMatrix<S, LO, GO, N>>& obj);
928
929#endif
Declaration of Ifpack2::Details::Behavior, a class that describes Ifpack2's run-time behavior.
Declaration and definition of the Ifpack2::Details::MultiVectorLocalGatherScatter class.
Interface for creating and solving a set of local linear problems.
Definition Ifpack2_Container_decl.hpp:79
virtual ~Container()
Destructor.
Definition Ifpack2_Container_def.hpp:81
bool hasBlockCrs_
Whether the input matrix is a BlockCRS matrix.
Definition Ifpack2_Container_decl.hpp:275
bool IsParallel_
Whether the problem is distributed across multiple MPI processes.
Definition Ifpack2_Container_decl.hpp:267
int numBlocks_
The number of blocks (partitions) in the container.
Definition Ifpack2_Container_decl.hpp:257
Teuchos::ArrayView< const LO > getBlockRows(int blockIndex) const
Local indices of the rows of the input matrix that belong to this block.
Definition Ifpack2_Container_def.hpp:85
static std::string getName()
Definition Ifpack2_Container_def.hpp:147
int bcrsBlockSize_
If hasBlockCrs_, the number of DOFs per vertex. Otherwise 1.
Definition Ifpack2_Container_decl.hpp:277
Teuchos::RCP< const block_crs_matrix_type > inputBlockMatrix_
The input matrix, dynamic cast to BlockCrsMatrix. May be null.
Definition Ifpack2_Container_decl.hpp:254
virtual void DoGSBlock(ConstHostView X, HostView Y, HostView Y2, HostView Resid, SC dampingFactor, LO i) const
Do one step of Gauss-Seidel on block i (used by DoGaussSeidel and DoSGS)
Definition Ifpack2_Container_def.hpp:152
virtual void applyMV(const mv_type &X, mv_type &Y) const
Wrapper for apply with MultiVector.
Definition Ifpack2_Container_def.hpp:135
LO NumLocalRows_
Number of local rows in input matrix.
Definition Ifpack2_Container_decl.hpp:269
bool isInitialized() const
Whether the container has been successfully initialized.
Definition Ifpack2_Container_def.hpp:124
void setBlockSizes(const Teuchos::Array< Teuchos::Array< LO > > &partitions)
Initialize arrays with information about block sizes.
Definition Ifpack2_Container_def.hpp:90
LO scalarsPerRow_
Definition Ifpack2_Container_decl.hpp:282
Teuchos::Array< LO > blockSizes_
Number of rows in each block.
Definition Ifpack2_Container_decl.hpp:261
GO NumGlobalNonzeros_
Number of nonzeros in input matrix.
Definition Ifpack2_Container_decl.hpp:273
Container(const Teuchos::RCP< const row_matrix_type > &matrix, const Teuchos::Array< Teuchos::Array< LO > > &partitions, bool pointIndexed)
Constructor.
Definition Ifpack2_Container_def.hpp:22
virtual std::ostream & print(std::ostream &os) const =0
Print basic information about the container to os.
virtual void weightedApplyMV(const mv_type &X, mv_type &Y, vector_type &W) const
Wrapper for weightedApply with MultiVector.
Definition Ifpack2_Container_def.hpp:141
Teuchos::RCP< const row_matrix_type > inputMatrix_
The input matrix to the constructor.
Definition Ifpack2_Container_decl.hpp:248
bool pointIndexed_
(If hasBlockCrs_) Whether the blocks are described using sub-block row indices instead of full block ...
Definition Ifpack2_Container_decl.hpp:279
typename KokkosKernels::ArithTraits< SC >::val_type ISC
Internal representation of Scalar in Kokkos::View.
Definition Ifpack2_Container_decl.hpp:101
GO NumGlobalRows_
Number of global rows in input matrix.
Definition Ifpack2_Container_decl.hpp:271
bool isComputed() const
Whether the container has been successfully computed.
Definition Ifpack2_Container_def.hpp:129
typename mv_type::dual_view_type::t_host HostView
Definition Ifpack2_Container_decl.hpp:105
The implementation of the numerical features of Container (Jacobi, Gauss-Seidel, SGS)....
Definition Ifpack2_Container_decl.hpp:307
LocalScalarType LSC
The internal representation of LocalScalarType in Kokkos::View.
Definition Ifpack2_Container_decl.hpp:326
void DoGSBlock(ConstHostView X, HostView Y, HostView Y2, HostView Resid, SC dampingFactor, LO i) const
Do one step of Gauss-Seidel on block i (used by DoGaussSeidel and DoSGS)
Definition Ifpack2_Container_def.hpp:232
static bool debug()
Whether Ifpack2 is in debug mode.
Definition Ifpack2_Details_Behavior.cpp:31
Ifpack2's implementation of Trilinos::Details::LinearSolver interface.
Definition Ifpack2_Details_LinearSolver_decl.hpp:75
Ifpack2 implementation details.
Preconditioners and smoothers for Tpetra sparse matrices.
Definition Ifpack2_AdditiveSchwarz_decl.hpp:40
Structure for read-only views of general matrix rows.
Definition Ifpack2_Container_decl.hpp:498
StridedRowView(h_vals_type vals_, h_inds_type inds_, int blockSize_, size_t nnz_)
Constructor for row views (preferred)
Definition Ifpack2_Container_def.hpp:858