Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_CrsMatrixMultiplyOp.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_CRSMATRIXMULTIPLYOP_HPP
11#define TPETRA_CRSMATRIXMULTIPLYOP_HPP
12
17
19#include "Tpetra_CrsMatrix.hpp"
20#include "Tpetra_Util.hpp"
23#include "Tpetra_LocalCrsMatrixOperator.hpp"
24
25namespace Tpetra {
26
63template <class Scalar,
64 class MatScalar,
65 class LocalOrdinal,
66 class GlobalOrdinal,
67 class Node>
68class CrsMatrixMultiplyOp : public Operator<Scalar, LocalOrdinal, GlobalOrdinal, Node> {
69 public:
75
76 private:
77 using local_matrix_device_type =
78 typename crs_matrix_type::local_matrix_device_type;
79
80 public:
82
83
88 CrsMatrixMultiplyOp(const Teuchos::RCP<const crs_matrix_type>& A)
89 : matrix_(A)
90 , localMultiply_(std::make_shared<local_matrix_device_type>(
91 A->getLocalMatrixDevice())) {}
92
94 ~CrsMatrixMultiplyOp() override = default;
95
97
99
102 void
105 Teuchos::ETransp mode = Teuchos::NO_TRANS,
106 Scalar alpha = Teuchos::ScalarTraits<Scalar>::one(),
107 Scalar beta = Teuchos::ScalarTraits<Scalar>::zero()) const override {
108 TEUCHOS_TEST_FOR_EXCEPTION(!matrix_->isFillComplete(), std::runtime_error,
109 Teuchos::typeName(*this) << "::apply(): underlying matrix is not fill-complete.");
110 TEUCHOS_TEST_FOR_EXCEPTION(X.getNumVectors() != Y.getNumVectors(), std::runtime_error,
111 Teuchos::typeName(*this) << "::apply(X,Y): X and Y must have the same number of vectors.");
112 TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::ScalarTraits<Scalar>::isComplex && mode == Teuchos::TRANS, std::logic_error,
113 Teuchos::typeName(*this) << "::apply() does not currently support transposed multiplications for complex scalar types.");
114 if (mode == Teuchos::NO_TRANS) {
116 } else {
118 }
119 }
120
126 bool hasTransposeApply() const override {
127 return true;
128 }
129
131 Teuchos::RCP<const map_type> getDomainMap() const override {
132 return matrix_->getDomainMap();
133 }
134
136 Teuchos::RCP<const map_type> getRangeMap() const override {
137 return matrix_->getRangeMap();
138 }
139
141
142 protected:
144
146 const Teuchos::RCP<const crs_matrix_type> matrix_;
147
150
163 mutable Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > importMV_;
164
177 mutable Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > exportMV_;
178
181 void
184 Teuchos::ETransp mode,
186 Scalar beta) const {
187 using Teuchos::null;
188 using Teuchos::RCP;
189 using Teuchos::rcp;
192 using STS = Teuchos::ScalarTraits<Scalar>;
194
195 const size_t numVectors = X_in.getNumVectors();
196 // because of Views, it is difficult to determine if X and Y point to the same data.
197 // however, if they reference the exact same object, we will do the user the favor of copying X into new storage (with a warning)
198 // we ony need to do this if we have trivial importers; otherwise, we don't actually apply the operator from X into Y
199 RCP<const import_type> importer = matrix_->getGraph()->getImporter();
200 RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
201
202 // some parameters for below
203 const bool Y_is_replicated = !Y_in.isDistributed();
204 const bool Y_is_overwritten = (beta == STS::zero());
205 const int myRank = matrix_->getComm()->getRank();
206 if (Y_is_replicated && myRank != 0) {
207 beta = STS::zero();
208 }
209
210 // access X indirectly, in case we need to create temporary storage
212 // currently, cannot multiply from multivector of non-constant stride
213 if (!X_in.isConstantStride() && importer == null) {
214 // generate a strided copy of X_in
215 X = Teuchos::rcp(new MV(X_in, Teuchos::Copy));
216 } else {
217 // just temporary, so this non-owning RCP is okay
218 X = Teuchos::rcpFromRef(X_in);
219 }
220
221 // set up import/export temporary multivectors
222 if (importer != null) {
223 if (importMV_ != null && importMV_->getNumVectors() != numVectors) {
224 importMV_ = null;
225 }
226 if (importMV_ == null) {
227 importMV_ = rcp(new MV(matrix_->getColMap(), numVectors));
228 }
229 }
230 if (exporter != null) {
231 if (exportMV_ != null && exportMV_->getNumVectors() != numVectors) {
232 exportMV_ = null;
233 }
234 if (exportMV_ == null) {
235 exportMV_ = rcp(new MV(matrix_->getRowMap(), numVectors));
236 }
237 }
238
239 // If we have a non-trivial exporter, we must import elements that are permuted or are on other processors
240 if (exporter != null) {
241 exportMV_->doImport(X_in, *exporter, INSERT);
242 X = exportMV_; // multiply out of exportMV_
243 }
244
245 auto X_lcl = X->getLocalViewDevice(Access::ReadOnly);
246
247 // If we have a non-trivial importer, we must export elements that are permuted or belong to other processors
248 // We will compute solution into the to-be-exported MV; get a view
249 if (importer != null) {
250 // Beta is zero here, so we clobber Y_lcl
251 auto Y_lcl = importMV_->getLocalViewDevice(Access::OverwriteAll);
252
253 localMultiply_.apply(X_lcl, Y_lcl, mode, alpha, STS::zero());
254 if (Y_is_overwritten) {
255 Y_in.putScalar(STS::zero());
256 } else {
257 Y_in.scale(beta);
258 }
259 Y_in.doExport(*importMV_, *importer, ADD_ASSIGN);
260 }
261 // otherwise, multiply into Y
262 else {
263 // can't multiply in-situ; can't multiply into non-strided multivector
264 if (!Y_in.isConstantStride() || X.getRawPtr() == &Y_in) {
265 // generate a strided copy of Y
266 MV Y(Y_in, Teuchos::Copy);
269 Y_lcl = Y.getLocalViewDevice(Access::OverwriteAll);
270 else
271 Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
272
275 } else {
278 Y_lcl = Y_in.getLocalViewDevice(Access::OverwriteAll);
279 else
280 Y_lcl = Y_in.getLocalViewDevice(Access::ReadWrite);
281
283 }
284 }
285 // Handle case of rangemap being a local replicated map: in this case, sum contributions from each processor
286 if (Y_is_replicated) {
287 Y_in.reduce();
288 }
289 }
290
292 void
296 Scalar beta) const {
297 using Teuchos::NO_TRANS;
298 using Teuchos::RCP;
299 using Teuchos::rcp;
300 using Teuchos::rcp_const_cast;
301 using Teuchos::rcpFromRef;
305 typedef Teuchos::ScalarTraits<Scalar> STS;
307
308 if (alpha == STS::zero()) {
309 if (beta == STS::zero()) {
310 Y_in.putScalar(STS::zero());
311 } else if (beta != STS::one()) {
312 Y_in.scale(beta);
313 }
314 return;
315 }
316
317 // It's possible that X is a view of Y or vice versa. We don't
318 // allow this (apply() requires that X and Y not alias one
319 // another), but it's helpful to detect and work around this
320 // case. We don't try to to detect the more subtle cases (e.g.,
321 // one is a subview of the other, but their initial pointers
322 // differ). We only need to do this if this matrix's Import is
323 // trivial; otherwise, we don't actually apply the operator from
324 // X into Y.
325
326 RCP<const import_type> importer = matrix_->getGraph()->getImporter();
327 RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
328
329 // If beta == 0, then the output MV will be overwritten; none of
330 // its entries should be read. (Sparse BLAS semantics say that we
331 // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
332 // This matters if we need to do an Export operation; see below.
333 const bool Y_is_overwritten = (beta == STS::zero());
334
335 // We treat the case of a replicated MV output specially.
336 const bool Y_is_replicated =
337 (!Y_in.isDistributed() && matrix_->getComm()->getSize() != 1);
338
339 // This is part of the special case for replicated MV output.
340 // We'll let each process do its thing, but do an all-reduce at
341 // the end to sum up the results. Setting beta=0 on all
342 // processes but Proc 0 makes the math work out for the
343 // all-reduce. (This assumes that the replicated data is
344 // correctly replicated, so that the data are the same on all
345 // processes.)
346 if (Y_is_replicated && matrix_->getComm()->getRank() > 0) {
347 beta = STS::zero();
348 }
349
350 // Temporary MV for Import operation. After the block of code
351 // below, this will be an (Imported if necessary) column Map MV
352 // ready to give to localMultiply_.apply(...).
354 if (importer.is_null()) {
355 if (!X_in.isConstantStride()) {
356 // Not all sparse mat-vec kernels can handle an input MV with
357 // nonconstant stride correctly, so we have to copy it in that
358 // case into a constant stride MV. To make a constant stride
359 // copy of X_in, we force creation of the column (== domain)
360 // Map MV (if it hasn't already been created, else fetch the
361 // cached copy). This avoids creating a new MV each time.
362 RCP<MV> X_colMapNonConst = getColumnMapMultiVector(X_in, true);
365 } else {
366 // The domain and column Maps are the same, so do the local
367 // multiply using the domain Map input MV X_in.
369 }
370 } else { // need to Import source (multi)vector
371 ProfilingRegion regionImport("Tpetra::CrsMatrixMultiplyOp::apply: Import");
372
373 // We're doing an Import anyway, which will copy the relevant
374 // elements of the domain Map MV X_in into a separate column Map
375 // MV. Thus, we don't have to worry whether X_in is constant
376 // stride.
377 RCP<MV> X_colMapNonConst = getColumnMapMultiVector(X_in);
378
379 // Import from the domain Map MV to the column Map MV.
380 X_colMapNonConst->doImport(X_in, *importer, INSERT);
382 }
383
384 // Temporary MV for doExport (if needed), or for copying a
385 // nonconstant stride output MV into a constant stride MV. This
386 // is null if we don't need the temporary MV, that is, if the
387 // Export is trivial (null).
388 RCP<MV> Y_rowMap = getRowMapMultiVector(Y_in);
389
390 auto X_lcl = X_colMap->getLocalViewDevice(Access::ReadOnly);
391
392 // If we have a nontrivial Export object, we must perform an
393 // Export. In that case, the local multiply result will go into
394 // the row Map multivector. We don't have to make a
395 // constant-stride version of Y_in in this case, because we had to
396 // make a constant stride Y_rowMap MV and do an Export anyway.
397 if (!exporter.is_null()) {
398 auto Y_lcl = Y_rowMap->getLocalViewDevice(Access::OverwriteAll);
399
401 alpha, STS::zero());
402 {
403 ProfilingRegion regionExport("Tpetra::CrsMatrixMultiplyOp::apply: Export");
404
405 // If we're overwriting the output MV Y_in completely (beta
406 // == 0), then make sure that it is filled with zeros before
407 // we do the Export. Otherwise, the ADD combine mode will
408 // use data in Y_in, which is supposed to be zero.
409 if (Y_is_overwritten) {
410 Y_in.putScalar(STS::zero());
411 } else {
412 // Scale the output MV by beta, so that the Export sums in
413 // the mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
414 Y_in.scale(beta);
415 }
416 // Do the Export operation.
417 Y_in.doExport(*Y_rowMap, *exporter, ADD_ASSIGN);
418 }
419 } else { // Don't do an Export: row Map and range Map are the same.
420 //
421 // If Y_in does not have constant stride, or if the column Map
422 // MV aliases Y_in, then we can't let the kernel write directly
423 // to Y_in. Instead, we have to use the cached row (== range)
424 // Map MV as temporary storage.
425 //
426 // FIXME (mfh 05 Jun 2014, mfh 07 Dec 2018) This test for
427 // aliasing only tests if the user passed in the same
428 // MultiVector for both X and Y. It won't detect whether one
429 // MultiVector views the other. We should also check the
430 // MultiVectors' raw data pointers.
431 if (!Y_in.isConstantStride() || X_colMap.getRawPtr() == &Y_in) {
432 // Force creating the MV if it hasn't been created already.
433 // This will reuse a previously created cached MV.
434 Y_rowMap = getRowMapMultiVector(Y_in, true);
435
436 // If beta == 0, we don't need to copy Y_in into Y_rowMap,
437 // since we're overwriting it anyway.
438 if (beta != STS::zero()) {
440 }
443 Y_lcl = Y_rowMap->getLocalViewDevice(Access::OverwriteAll);
444 else
445 Y_lcl = Y_rowMap->getLocalViewDevice(Access::ReadWrite);
446
449 } else {
452 Y_lcl = Y_in.getLocalViewDevice(Access::OverwriteAll);
453 else
454 Y_lcl = Y_in.getLocalViewDevice(Access::ReadWrite);
455
457 }
458 }
459
460 // If the range Map is a locally replicated Map, sum up
461 // contributions from each process. We set beta = 0 on all
462 // processes but Proc 0 initially, so this will handle the scaling
463 // factor beta correctly.
464 if (Y_is_replicated) {
465 ProfilingRegion regionReduce("Tpetra::CrsMatrixMultiplyOp::apply: Reduce Y");
466 Y_in.reduce();
467 }
468 }
469
470 private:
490 Teuchos::RCP<MV>
492 const bool force = false) const {
493 using Teuchos::null;
494 using Teuchos::RCP;
495 using Teuchos::rcp;
497
498 const size_t numVecs = X_domainMap.getNumVectors();
499 RCP<const import_type> importer = matrix_->getGraph()->getImporter();
500 RCP<const map_type> colMap = matrix_->getColMap();
501
502 RCP<MV> X_colMap; // null by default
503
504 // If the Import object is trivial (null), then we don't need a
505 // separate column Map multivector. Just return null in that
506 // case. The caller is responsible for knowing not to use the
507 // returned null pointer.
508 //
509 // If the Import is nontrivial, then we do need a separate
510 // column Map multivector for the Import operation. Check in
511 // that case if we have to (re)create the column Map
512 // multivector.
513 if (!importer.is_null() || force) {
514 if (importMV_.is_null() || importMV_->getNumVectors() != numVecs) {
515 X_colMap = rcp(new MV(colMap, numVecs));
516
517 // Cache the newly created multivector for later reuse.
519 } else { // Yay, we can reuse the cached multivector!
520 X_colMap = importMV_;
521 // mfh 09 Jan 2013: We don't have to fill with zeros first,
522 // because the Import uses INSERT combine mode, which overwrites
523 // existing entries.
524 //
525 // X_colMap->putScalar (STS::zero ());
526 }
527 }
528 return X_colMap;
529 }
530
552 Teuchos::RCP<MV>
553 getRowMapMultiVector(const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Y_rangeMap,
554 const bool force = false) const {
555 using Teuchos::null;
556 using Teuchos::RCP;
557 using Teuchos::rcp;
558 typedef Export<LocalOrdinal, GlobalOrdinal, Node> export_type;
559
560 const size_t numVecs = Y_rangeMap.getNumVectors();
561 RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
562 RCP<const map_type> rowMap = matrix_->getRowMap();
563
564 RCP<MV> Y_rowMap; // null by default
565
566 // If the Export object is trivial (null), then we don't need a
567 // separate row Map multivector. Just return null in that case.
568 // The caller is responsible for knowing not to use the returned
569 // null pointer.
570 //
571 // If the Export is nontrivial, then we do need a separate row
572 // Map multivector for the Export operation. Check in that case
573 // if we have to (re)create the row Map multivector.
574 if (!exporter.is_null() || force) {
575 if (exportMV_.is_null() || exportMV_->getNumVectors() != numVecs) {
576 Y_rowMap = rcp(new MV(rowMap, numVecs));
577 exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse
578 } else { // Yay, we can reuse the cached multivector!
579 Y_rowMap = exportMV_;
580 }
581 }
582 return Y_rowMap;
583 }
584};
585
593template <class OpScalar,
594 class MatScalar,
595 class LocalOrdinal,
596 class GlobalOrdinal,
597 class Node>
598Teuchos::RCP<
599 CrsMatrixMultiplyOp<OpScalar, MatScalar, LocalOrdinal, GlobalOrdinal, Node> >
600createCrsMatrixMultiplyOp(const Teuchos::RCP<
603 GlobalOrdinal, Node>
604 op_type;
605 return Teuchos::rcp(new op_type(A));
606}
607
608} // end of namespace Tpetra
609
610#endif // TPETRA_CRSMATRIXMULTIPLYOP_HPP
Forward declaration of Tpetra::CrsMatrixMultiplyOp.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Stand-alone utility functions and macros.
A class for wrapping a CrsMatrix multiply in a Operator.
bool hasTransposeApply() const override
Whether this Operator's apply() method can apply the transpose or conjugate transpose.
void applyNonTranspose(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X_in, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y_in, Scalar alpha, Scalar beta) const
Apply the matrix (not its transpose) to X_in, producing Y_in.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > exportMV_
Row Map MultiVector used in apply().
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute Y = beta*Y + alpha*Op(A)*X, where Op(A) is either A, , or .
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this Operator.
void applyTranspose(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X_in, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y_in, Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Apply the transpose or conjugate transpose of the matrix to X_in, producing Y_in.
CrsMatrixMultiplyOp(const Teuchos::RCP< const crs_matrix_type > &A)
Constructor.
~CrsMatrixMultiplyOp() override=default
Destructor (virtual for memory safety of derived classes).
Teuchos::RCP< CrsMatrixMultiplyOp< OpScalar, MatScalar, LocalOrdinal, GlobalOrdinal, Node > > createCrsMatrixMultiplyOp(const Teuchos::RCP< const CrsMatrix< MatScalar, LocalOrdinal, GlobalOrdinal, Node > > &A)
Non-member function to create a CrsMatrixMultiplyOp.
const Teuchos::RCP< const crs_matrix_type > matrix_
The underlying CrsMatrix object.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > importMV_
Column Map MultiVector used in apply().
LocalCrsMatrixOperator< Scalar, MatScalar, typename crs_matrix_type::device_type > localMultiply_
Implementation of local sparse matrix-vector multiply.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this Operator.
Struct that holds views of the contents of a CrsMatrix.
One or more distributed dense vectors.
Abstract interface for operators (e.g., matrices and preconditioners).
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes)
@ INSERT
Insert new values that don't currently exist.