Tpetra parallel linear algebra (Version of the Day)
Tpetra_CrsMatrixMultiplyOp.hpp
// @HEADER
// *****************************************************************************
// Tpetra: Templated Linear Algebra Services Package
//
// Copyright 2008 NTESS and the Tpetra contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef TPETRA_CRSMATRIXMULTIPLYOP_HPP
#define TPETRA_CRSMATRIXMULTIPLYOP_HPP

/// \file Tpetra_CrsMatrixMultiplyOp.hpp
/// \brief Declaration and definition of Tpetra::CrsMatrixMultiplyOp
///   and its nonmember constructor Tpetra::createCrsMatrixMultiplyOp.

#include "Tpetra_CrsMatrixMultiplyOp_fwd.hpp"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_Util.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_Details_Profiling.hpp"
#include "Tpetra_LocalCrsMatrixOperator.hpp"

namespace Tpetra {

/// \brief A class for wrapping a CrsMatrix multiply in an Operator.
///
/// \tparam Scalar The scalar type of the MultiVectors given to apply().
/// \tparam MatScalar The scalar type of the entries of the wrapped CrsMatrix.
template <class Scalar,
          class MatScalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node>
class CrsMatrixMultiplyOp : public Operator<Scalar, LocalOrdinal, GlobalOrdinal, Node> {
 public:
  //! The specialization of CrsMatrix which this class wraps.
  using crs_matrix_type =
      CrsMatrix<MatScalar, LocalOrdinal, GlobalOrdinal, Node>;
  //! The specialization of Map which this class uses.
  using map_type = Map<LocalOrdinal, GlobalOrdinal, Node>;

 private:
  using local_matrix_device_type =
      typename crs_matrix_type::local_matrix_device_type;
  //! Shorthand for the MultiVector specialization used in apply().
  using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;

 public:
  /// \brief Constructor.
  ///
  /// \param A [in] The CrsMatrix to wrap as an Operator.
  CrsMatrixMultiplyOp(const Teuchos::RCP<const crs_matrix_type>& A)
    : matrix_(A) {}

  //! Destructor (virtual for memory safety of derived classes).
  ~CrsMatrixMultiplyOp() override = default;

  /// \brief Compute Y = beta*Y + alpha*Op(A)*X, where Op(A) is either
  ///   A, its transpose, or its conjugate transpose.
  void
  apply(const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X,
        MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Y,
        Teuchos::ETransp mode = Teuchos::NO_TRANS,
        Scalar alpha = Teuchos::ScalarTraits<Scalar>::one(),
        Scalar beta = Teuchos::ScalarTraits<Scalar>::zero()) const override {
    TEUCHOS_TEST_FOR_EXCEPTION(!matrix_->isFillComplete(), std::runtime_error,
                               Teuchos::typeName(*this) << "::apply(): underlying matrix is not fill-complete.");
    TEUCHOS_TEST_FOR_EXCEPTION(X.getNumVectors() != Y.getNumVectors(), std::runtime_error,
                               Teuchos::typeName(*this) << "::apply(X,Y): X and Y must have the same number of vectors.");
    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::ScalarTraits<Scalar>::isComplex && mode == Teuchos::TRANS, std::logic_error,
                               Teuchos::typeName(*this) << "::apply() does not currently support transposed multiplications for complex scalar types.");
    if (mode == Teuchos::NO_TRANS) {
      applyNonTranspose(X, Y, alpha, beta);
    } else {
      applyTranspose(X, Y, mode, alpha, beta);
    }
  }

  /// \brief Whether this Operator's apply() method can apply the
  ///   transpose or conjugate transpose.
  bool hasTransposeApply() const override {
    return true;
  }

  //! The domain Map of this Operator.
  Teuchos::RCP<const map_type> getDomainMap() const override {
    return matrix_->getDomainMap();
  }

  //! The range Map of this Operator.
  Teuchos::RCP<const map_type> getRangeMap() const override {
    return matrix_->getRangeMap();
  }

 protected:
  //! The type of the local sparse matrix-vector multiply operator.
  using local_matrix_op_t =
      LocalCrsMatrixOperator<Scalar, MatScalar,
                             typename crs_matrix_type::device_type>;

  //! The underlying CrsMatrix object.
  const Teuchos::RCP<const crs_matrix_type> matrix_;

  //! Column Map MultiVector used (and cached for reuse) in apply().
  mutable Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > importMV_;

  //! Row Map MultiVector used (and cached for reuse) in apply().
  mutable Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > exportMV_;

  //! Apply the transpose or conjugate transpose of the matrix to X_in, producing Y_in.
  void
  applyTranspose(const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X_in,
                 MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Y_in,
                 Teuchos::ETransp mode,
                 Scalar alpha,
                 Scalar beta) const {
    using Teuchos::null;
    using Teuchos::RCP;
    using Teuchos::rcp;
    using import_type = Import<LocalOrdinal, GlobalOrdinal, Node>;
    using export_type = Export<LocalOrdinal, GlobalOrdinal, Node>;
    using STS = Teuchos::ScalarTraits<Scalar>;

    const size_t numVectors = X_in.getNumVectors();
    // because of Views, it is difficult to determine if X and Y point to the same data.
    // however, if they reference the exact same object, we will do the user the favor of copying X into new storage (with a warning)
    // we only need to do this if we have trivial importers; otherwise, we don't actually apply the operator from X into Y
    RCP<const import_type> importer = matrix_->getGraph()->getImporter();
    RCP<const export_type> exporter = matrix_->getGraph()->getExporter();

    // some parameters for below
    const bool Y_is_replicated = !Y_in.isDistributed();
    const bool Y_is_overwritten = (beta == STS::zero());
    const int myRank = matrix_->getComm()->getRank();
    if (Y_is_replicated && myRank != 0) {
      beta = STS::zero();
    }

    // access X indirectly, in case we need to create temporary storage
    RCP<const MV> X;
    // currently, cannot multiply from multivector of non-constant stride
    if (!X_in.isConstantStride() && importer == null) {
      // generate a strided copy of X_in
      X = Teuchos::rcp(new MV(X_in, Teuchos::Copy));
    } else {
      // just temporary, so this non-owning RCP is okay
      X = Teuchos::rcpFromRef(X_in);
    }

    // set up import/export temporary multivectors
    if (importer != null) {
      if (importMV_ != null && importMV_->getNumVectors() != numVectors) {
        importMV_ = null;
      }
      if (importMV_ == null) {
        importMV_ = rcp(new MV(matrix_->getColMap(), numVectors));
      }
    }
    if (exporter != null) {
      if (exportMV_ != null && exportMV_->getNumVectors() != numVectors) {
        exportMV_ = null;
      }
      if (exportMV_ == null) {
        exportMV_ = rcp(new MV(matrix_->getRowMap(), numVectors));
      }
    }

    // If we have a non-trivial exporter, we must import elements that are permuted or are on other processors
    if (exporter != null) {
      exportMV_->doImport(X_in, *exporter, INSERT);
      X = exportMV_;  // multiply out of exportMV_
    }

    auto X_lcl = X->getLocalViewDevice(Access::ReadOnly);

    auto localMultiply = local_matrix_op_t(std::make_shared<local_matrix_device_type>(matrix_->getLocalMatrixDevice()));

    // If we have a non-trivial importer, we must export elements that are permuted or belong to other processors
    // We will compute solution into the to-be-exported MV; get a view
    if (importer != null) {
      // Beta is zero here, so we clobber Y_lcl
      auto Y_lcl = importMV_->getLocalViewDevice(Access::OverwriteAll);

      localMultiply.apply(X_lcl, Y_lcl, mode, alpha, STS::zero());
      if (Y_is_overwritten) {
        Y_in.putScalar(STS::zero());
      } else {
        Y_in.scale(beta);
      }
      Y_in.doExport(*importMV_, *importer, ADD_ASSIGN);
    }
    // otherwise, multiply into Y
    else {
      // can't multiply in-situ; can't multiply into non-strided multivector
      if (!Y_in.isConstantStride() || X.getRawPtr() == &Y_in) {
        // generate a strided copy of Y
        MV Y(Y_in, Teuchos::Copy);
        typename MV::dual_view_type::t_dev Y_lcl;
        if (Y_is_overwritten)
          Y_lcl = Y.getLocalViewDevice(Access::OverwriteAll);
        else
          Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);

        localMultiply.apply(X_lcl, Y_lcl, mode, alpha, beta);
        Tpetra::deep_copy(Y_in, Y);
      } else {
        typename MV::dual_view_type::t_dev Y_lcl;
        if (Y_is_overwritten)
          Y_lcl = Y_in.getLocalViewDevice(Access::OverwriteAll);
        else
          Y_lcl = Y_in.getLocalViewDevice(Access::ReadWrite);

        localMultiply.apply(X_lcl, Y_lcl, mode, alpha, beta);
      }
    }
    // Handle case of rangemap being a local replicated map: in this case, sum contributions from each processor
    if (Y_is_replicated) {
      Y_in.reduce();
    }
  }

  //! Apply the matrix (not its transpose) to X_in, producing Y_in.
  void
  applyNonTranspose(const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X_in,
                    MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Y_in,
                    Scalar alpha,
                    Scalar beta) const {
    using Teuchos::NO_TRANS;
    using Teuchos::RCP;
    using Teuchos::rcp;
    using Teuchos::rcp_const_cast;
    using Teuchos::rcpFromRef;
    using Tpetra::Details::ProfilingRegion;
    using import_type = Import<LocalOrdinal, GlobalOrdinal, Node>;
    using export_type = Export<LocalOrdinal, GlobalOrdinal, Node>;
    typedef Teuchos::ScalarTraits<Scalar> STS;

    if (alpha == STS::zero()) {
      if (beta == STS::zero()) {
        Y_in.putScalar(STS::zero());
      } else if (beta != STS::one()) {
        Y_in.scale(beta);
      }
      return;
    }

    // It's possible that X is a view of Y or vice versa. We don't
    // allow this (apply() requires that X and Y not alias one
    // another), but it's helpful to detect and work around this
    // case. We don't try to detect the more subtle cases (e.g.,
    // one is a subview of the other, but their initial pointers
    // differ). We only need to do this if this matrix's Import is
    // trivial; otherwise, we don't actually apply the operator from
    // X into Y.

    RCP<const import_type> importer = matrix_->getGraph()->getImporter();
    RCP<const export_type> exporter = matrix_->getGraph()->getExporter();

    // If beta == 0, then the output MV will be overwritten; none of
    // its entries should be read. (Sparse BLAS semantics say that we
    // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
    // This matters if we need to do an Export operation; see below.
    const bool Y_is_overwritten = (beta == STS::zero());

    // We treat the case of a replicated MV output specially.
    const bool Y_is_replicated =
        (!Y_in.isDistributed() && matrix_->getComm()->getSize() != 1);

    // This is part of the special case for replicated MV output.
    // We'll let each process do its thing, but do an all-reduce at
    // the end to sum up the results. Setting beta=0 on all
    // processes but Proc 0 makes the math work out for the
    // all-reduce. (This assumes that the replicated data is
    // correctly replicated, so that the data are the same on all
    // processes.)
    if (Y_is_replicated && matrix_->getComm()->getRank() > 0) {
      beta = STS::zero();
    }

    // Temporary MV for Import operation. After the block of code
    // below, this will be an (Imported if necessary) column Map MV
    // ready to give to localMultiply.apply(...).
    RCP<const MV> X_colMap;
    if (importer.is_null()) {
      if (!X_in.isConstantStride()) {
        // Not all sparse mat-vec kernels can handle an input MV with
        // nonconstant stride correctly, so we have to copy it in that
        // case into a constant stride MV. To make a constant stride
        // copy of X_in, we force creation of the column (== domain)
        // Map MV (if it hasn't already been created, else fetch the
        // cached copy). This avoids creating a new MV each time.
        RCP<MV> X_colMapNonConst = getColumnMapMultiVector(X_in, true);
        Tpetra::deep_copy(*X_colMapNonConst, X_in);
        X_colMap = rcp_const_cast<const MV>(X_colMapNonConst);
      } else {
        // The domain and column Maps are the same, so do the local
        // multiply using the domain Map input MV X_in.
        X_colMap = rcpFromRef(X_in);
      }
    } else {  // need to Import source (multi)vector
      ProfilingRegion regionImport("Tpetra::CrsMatrixMultiplyOp::apply: Import");

      // We're doing an Import anyway, which will copy the relevant
      // elements of the domain Map MV X_in into a separate column Map
      // MV. Thus, we don't have to worry whether X_in is constant
      // stride.
      RCP<MV> X_colMapNonConst = getColumnMapMultiVector(X_in);

      // Import from the domain Map MV to the column Map MV.
      X_colMapNonConst->doImport(X_in, *importer, INSERT);
      X_colMap = rcp_const_cast<const MV>(X_colMapNonConst);
    }

    // Temporary MV for doExport (if needed), or for copying a
    // nonconstant stride output MV into a constant stride MV. This
    // is null if we don't need the temporary MV, that is, if the
    // Export is trivial (null).
    RCP<MV> Y_rowMap = getRowMapMultiVector(Y_in);

    auto X_lcl = X_colMap->getLocalViewDevice(Access::ReadOnly);
    auto localMultiply = local_matrix_op_t(std::make_shared<local_matrix_device_type>(matrix_->getLocalMatrixDevice()));

    // If we have a nontrivial Export object, we must perform an
    // Export. In that case, the local multiply result will go into
    // the row Map multivector. We don't have to make a
    // constant-stride version of Y_in in this case, because we had to
    // make a constant stride Y_rowMap MV and do an Export anyway.
    if (!exporter.is_null()) {
      auto Y_lcl = Y_rowMap->getLocalViewDevice(Access::OverwriteAll);

      localMultiply.apply(X_lcl, Y_lcl, NO_TRANS,
                          alpha, STS::zero());
      {
        ProfilingRegion regionExport("Tpetra::CrsMatrixMultiplyOp::apply: Export");

        // If we're overwriting the output MV Y_in completely (beta
        // == 0), then make sure that it is filled with zeros before
        // we do the Export. Otherwise, the ADD combine mode will
        // use data in Y_in, which is supposed to be zero.
        if (Y_is_overwritten) {
          Y_in.putScalar(STS::zero());
        } else {
          // Scale the output MV by beta, so that the Export sums in
          // the mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
          Y_in.scale(beta);
        }
        // Do the Export operation.
        Y_in.doExport(*Y_rowMap, *exporter, ADD_ASSIGN);
      }
    } else {  // Don't do an Export: row Map and range Map are the same.
      //
      // If Y_in does not have constant stride, or if the column Map
      // MV aliases Y_in, then we can't let the kernel write directly
      // to Y_in. Instead, we have to use the cached row (== range)
      // Map MV as temporary storage.
      //
      // FIXME (mfh 05 Jun 2014, mfh 07 Dec 2018) This test for
      // aliasing only tests if the user passed in the same
      // MultiVector for both X and Y. It won't detect whether one
      // MultiVector views the other. We should also check the
      // MultiVectors' raw data pointers.
      if (!Y_in.isConstantStride() || X_colMap.getRawPtr() == &Y_in) {
        // Force creating the MV if it hasn't been created already.
        // This will reuse a previously created cached MV.
        Y_rowMap = getRowMapMultiVector(Y_in, true);

        // If beta == 0, we don't need to copy Y_in into Y_rowMap,
        // since we're overwriting it anyway.
        if (beta != STS::zero()) {
          Tpetra::deep_copy(*Y_rowMap, Y_in);
        }
        typename MV::dual_view_type::t_dev Y_lcl;
        if (Y_is_overwritten)
          Y_lcl = Y_rowMap->getLocalViewDevice(Access::OverwriteAll);
        else
          Y_lcl = Y_rowMap->getLocalViewDevice(Access::ReadWrite);

        localMultiply.apply(X_lcl, Y_lcl, NO_TRANS, alpha, beta);
        Tpetra::deep_copy(Y_in, *Y_rowMap);
      } else {
        typename MV::dual_view_type::t_dev Y_lcl;
        if (Y_is_overwritten)
          Y_lcl = Y_in.getLocalViewDevice(Access::OverwriteAll);
        else
          Y_lcl = Y_in.getLocalViewDevice(Access::ReadWrite);

        localMultiply.apply(X_lcl, Y_lcl, NO_TRANS, alpha, beta);
      }
    }

    // If the range Map is a locally replicated Map, sum up
    // contributions from each process. We set beta = 0 on all
    // processes but Proc 0 initially, so this will handle the scaling
    // factor beta correctly.
    if (Y_is_replicated) {
      ProfilingRegion regionReduce("Tpetra::CrsMatrixMultiplyOp::apply: Reduce Y");
      Y_in.reduce();
    }
  }

 private:
  /// \brief Create a (or fetch a cached) column Map MultiVector.
  ///
  /// \param X_domainMap [in] A domain Map MultiVector; the returned
  ///   MultiVector (if nonnull) has the same number of columns.
  /// \param force [in] Create the MultiVector even if the Import is
  ///   trivial (null).
  Teuchos::RCP<MV>
  getColumnMapMultiVector(const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X_domainMap,
                          const bool force = false) const {
    using Teuchos::null;
    using Teuchos::RCP;
    using Teuchos::rcp;
    typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;

    const size_t numVecs = X_domainMap.getNumVectors();
    RCP<const import_type> importer = matrix_->getGraph()->getImporter();
    RCP<const map_type> colMap = matrix_->getColMap();

    RCP<MV> X_colMap;  // null by default

    // If the Import object is trivial (null), then we don't need a
    // separate column Map multivector. Just return null in that
    // case. The caller is responsible for knowing not to use the
    // returned null pointer.
    //
    // If the Import is nontrivial, then we do need a separate
    // column Map multivector for the Import operation. Check in
    // that case if we have to (re)create the column Map
    // multivector.
    if (!importer.is_null() || force) {
      if (importMV_.is_null() || importMV_->getNumVectors() != numVecs) {
        X_colMap = rcp(new MV(colMap, numVecs));

        // Cache the newly created multivector for later reuse.
        importMV_ = X_colMap;
      } else {  // Yay, we can reuse the cached multivector!
        X_colMap = importMV_;
        // mfh 09 Jan 2013: We don't have to fill with zeros first,
        // because the Import uses INSERT combine mode, which overwrites
        // existing entries.
        //
        // X_colMap->putScalar (STS::zero ());
      }
    }
    return X_colMap;
  }

  /// \brief Create a (or fetch a cached) row Map MultiVector.
  ///
  /// \param Y_rangeMap [in] A range Map MultiVector; the returned
  ///   MultiVector (if nonnull) has the same number of columns.
  /// \param force [in] Create the MultiVector even if the Export is
  ///   trivial (null).
  Teuchos::RCP<MV>
  getRowMapMultiVector(const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Y_rangeMap,
                       const bool force = false) const {
    using Teuchos::null;
    using Teuchos::RCP;
    using Teuchos::rcp;
    typedef Export<LocalOrdinal, GlobalOrdinal, Node> export_type;

    const size_t numVecs = Y_rangeMap.getNumVectors();
    RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
    RCP<const map_type> rowMap = matrix_->getRowMap();

    RCP<MV> Y_rowMap;  // null by default

    // If the Export object is trivial (null), then we don't need a
    // separate row Map multivector. Just return null in that case.
    // The caller is responsible for knowing not to use the returned
    // null pointer.
    //
    // If the Export is nontrivial, then we do need a separate row
    // Map multivector for the Export operation. Check in that case
    // if we have to (re)create the row Map multivector.
    if (!exporter.is_null() || force) {
      if (exportMV_.is_null() || exportMV_->getNumVectors() != numVecs) {
        Y_rowMap = rcp(new MV(rowMap, numVecs));
        exportMV_ = Y_rowMap;  // Cache the newly created MV for later reuse
      } else {  // Yay, we can reuse the cached multivector!
        Y_rowMap = exportMV_;
      }
    }
    return Y_rowMap;
  }
};

/// \brief Nonmember function to create a CrsMatrixMultiplyOp.
///
/// \param A [in] The CrsMatrix instance to wrap in a CrsMatrixMultiplyOp.
/// \return The CrsMatrixMultiplyOp wrapper for the given CrsMatrix.
template <class OpScalar,
          class MatScalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node>
Teuchos::RCP<
    CrsMatrixMultiplyOp<OpScalar, MatScalar, LocalOrdinal, GlobalOrdinal, Node> >
createCrsMatrixMultiplyOp(const Teuchos::RCP<
                          const CrsMatrix<MatScalar, LocalOrdinal, GlobalOrdinal, Node> >& A) {
  typedef CrsMatrixMultiplyOp<OpScalar, MatScalar, LocalOrdinal,
                              GlobalOrdinal, Node>
      op_type;
  return Teuchos::rcp(new op_type(A));
}

}  // end of namespace Tpetra

#endif  // TPETRA_CRSMATRIXMULTIPLYOP_HPP
Forward declaration of Tpetra::CrsMatrixMultiplyOp.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Stand-alone utility functions and macros.
A class for wrapping a CrsMatrix multiply in an Operator.
bool hasTransposeApply() const override
Whether this Operator's apply() method can apply the transpose or conjugate transpose.
void applyNonTranspose(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X_in, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y_in, Scalar alpha, Scalar beta) const
Apply the matrix (not its transpose) to X_in, producing Y_in.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > exportMV_
Row Map MultiVector used in apply().
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute Y = beta*Y + alpha*Op(A)*X, where Op(A) is either A, its transpose, or its conjugate transpose.
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this Operator.
void applyTranspose(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X_in, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y_in, Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Apply the transpose or conjugate transpose of the matrix to X_in, producing Y_in.
CrsMatrixMultiplyOp(const Teuchos::RCP< const crs_matrix_type > &A)
Constructor.
~CrsMatrixMultiplyOp() override=default
Destructor (virtual for memory safety of derived classes).
Teuchos::RCP< CrsMatrixMultiplyOp< OpScalar, MatScalar, LocalOrdinal, GlobalOrdinal, Node > > createCrsMatrixMultiplyOp(const Teuchos::RCP< const CrsMatrix< MatScalar, LocalOrdinal, GlobalOrdinal, Node > > &A)
Non-member function to create a CrsMatrixMultiplyOp.
const Teuchos::RCP< const crs_matrix_type > matrix_
The underlying CrsMatrix object.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > importMV_
Column Map MultiVector used in apply().
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this Operator.
Struct that holds views of the contents of a CrsMatrix.
Abstract interface for local operators (e.g., matrices and preconditioners).
One or more distributed dense vectors.
Abstract interface for operators (e.g., matrices and preconditioners).
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes).
INSERT
Insert new values that don't currently exist.
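
For reference, the following is a minimal, self-contained usage sketch; it is not part of the header above. It assembles a small diagonal test matrix, wraps it in a CrsMatrixMultiplyOp via createCrsMatrixMultiplyOp, and applies it to a MultiVector. The problem size, matrix values, and variable names are illustrative only, and the sketch assumes a recent Tpetra build with the default Scalar and ordinal instantiations (including double) enabled.

#include <Tpetra_Core.hpp>
#include <Tpetra_Map.hpp>
#include <Tpetra_CrsMatrix.hpp>
#include <Tpetra_MultiVector.hpp>
#include <Tpetra_CrsMatrixMultiplyOp.hpp>
#include <Teuchos_Tuple.hpp>

int main(int argc, char* argv[]) {
  // Initialize MPI (if enabled) and Kokkos; finalize both at scope exit.
  Tpetra::ScopeGuard tpetraScope(&argc, &argv);
  {
    using map_type = Tpetra::Map<>;
    using crs_matrix_type = Tpetra::CrsMatrix<double>;
    using mv_type = Tpetra::MultiVector<double>;
    using LO = map_type::local_ordinal_type;
    using GO = map_type::global_ordinal_type;

    auto comm = Tpetra::getDefaultComm();
    const Tpetra::global_size_t numGlobalRows = 50;  // illustrative size
    auto rowMap = Teuchos::rcp(new map_type(numGlobalRows, 0, comm));

    // Assemble a diagonal test matrix with 2.0 on the diagonal.
    auto A = Teuchos::rcp(new crs_matrix_type(rowMap, 1));
    for (size_t i = 0; i < rowMap->getLocalNumElements(); ++i) {
      const GO gblRow = rowMap->getGlobalElement(static_cast<LO>(i));
      A->insertGlobalValues(gblRow,
                            Teuchos::tuple<GO>(gblRow),
                            Teuchos::tuple<double>(2.0));
    }
    A->fillComplete();  // the wrapped matrix must be fill-complete before apply()

    // Wrap the matrix as an Operator whose apply() takes MultiVectors
    // of Scalar type double (here the same as the matrix's MatScalar).
    Teuchos::RCP<const crs_matrix_type> Aconst = A;
    auto op = Tpetra::createCrsMatrixMultiplyOp<double>(Aconst);

    // Y := alpha*A*X + beta*Y, with the default alpha = 1 and beta = 0.
    mv_type X(op->getDomainMap(), 1);
    mv_type Y(op->getRangeMap(), 1);
    X.putScalar(1.0);
    op->apply(X, Y);  // every entry of Y should now equal 2.0
  }
  return 0;
}

The two scalar template parameters let the Scalar type of the vectors passed to apply() differ from the matrix's MatScalar type; this sketch uses double for both.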