Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
11#define TPETRA_BLOCKCRSMATRIX_DEF_HPP
12
15
19#include "Tpetra_BlockMultiVector.hpp"
20#include "Tpetra_BlockView.hpp"
21
22#include "Teuchos_TimeMonitor.hpp"
23#ifdef HAVE_TPETRA_DEBUG
24#include <set>
25#endif // HAVE_TPETRA_DEBUG
26
27#include "KokkosSparse_spmv.hpp"
28
29namespace Tpetra {
30
31namespace Impl {
32
33template <typename T>
34struct BlockCrsRowStruct {
35 T totalNumEntries, totalNumBytes, maxRowLength;
36 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
37 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct& b) = default;
38 KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
39 : totalNumEntries(numEnt)
40 , totalNumBytes(numBytes)
41 , maxRowLength(rowLength) {}
42};
43
44template <typename T>
45static KOKKOS_INLINE_FUNCTION void operator+=(BlockCrsRowStruct<T>& a, const BlockCrsRowStruct<T>& b) {
46 a.totalNumEntries += b.totalNumEntries;
47 a.totalNumBytes += b.totalNumBytes;
48 a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
49}
50
51template <typename T, typename ExecSpace>
52struct BlockCrsReducer {
53 typedef BlockCrsReducer reducer;
54 typedef T value_type;
55 typedef Kokkos::View<value_type, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
56 value_type* value;
57
58 KOKKOS_INLINE_FUNCTION
59 BlockCrsReducer(value_type& val)
60 : value(&val) {}
61
62 KOKKOS_INLINE_FUNCTION void join(value_type& dst, value_type& src) const { dst += src; }
63 KOKKOS_INLINE_FUNCTION void join(value_type& dst, const value_type& src) const { dst += src; }
64 KOKKOS_INLINE_FUNCTION void init(value_type& val) const { val = value_type(); }
65 KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
66 KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
67};
68
69} // namespace Impl
70
71namespace { // (anonymous)
72
73// Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
74// version that takes two Kokkos::View arguments).
75template <class Scalar, class LO, class GO, class Node>
76class GetLocalDiagCopy {
77 public:
78 typedef typename Node::device_type device_type;
79 typedef size_t diag_offset_type;
80 typedef Kokkos::View<const size_t*, device_type,
81 Kokkos::MemoryUnmanaged>
82 diag_offsets_type;
83 typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
84 typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
85 typedef typename local_graph_device_type::row_map_type row_offsets_type;
86 typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
87 typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
88 typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
89
90 // Constructor
91 GetLocalDiagCopy(const diag_type& diag,
92 const values_type& val,
93 const diag_offsets_type& diagOffsets,
94 const row_offsets_type& ptr,
95 const LO blockSize)
96 : diag_(diag)
97 , diagOffsets_(diagOffsets)
98 , ptr_(ptr)
99 , blockSize_(blockSize)
100 , offsetPerBlock_(blockSize_ * blockSize_)
101 , val_(val) {}
102
103 KOKKOS_INLINE_FUNCTION void
104 operator()(const LO& lclRowInd) const {
105 using Kokkos::ALL;
106
107 // Get row offset
108 const size_t absOffset = ptr_[lclRowInd];
109
110 // Get offset relative to start of row
111 const size_t relOffset = diagOffsets_[lclRowInd];
112
113 // Get the total offset
114 const size_t pointOffset = (absOffset + relOffset) * offsetPerBlock_;
115
116 // Get a view of the block. BCRS currently uses LayoutRight
117 // regardless of the device.
118 typedef Kokkos::View<const IST**, Impl::BlockCrsMatrixLittleBlockArrayLayout,
119 device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
120 const_little_block_type;
121 const_little_block_type D_in(val_.data() + pointOffset,
122 blockSize_, blockSize_);
123 auto D_out = Kokkos::subview(diag_, lclRowInd, ALL(), ALL());
124 COPY(D_in, D_out);
125 }
126
127 private:
128 diag_type diag_;
129 diag_offsets_type diagOffsets_;
130 row_offsets_type ptr_;
131 LO blockSize_;
132 LO offsetPerBlock_;
133 values_type val_;
134};
135} // namespace
136
137template <class Scalar, class LO, class GO, class Node>
138std::ostream&
139BlockCrsMatrix<Scalar, LO, GO, Node>::
140 markLocalErrorAndGetStream() {
141 *(this->localError_) = true;
142 if ((*errs_).is_null()) {
143 *errs_ = Teuchos::rcp(new std::ostringstream());
144 }
145 return **errs_;
146}
147
148template <class Scalar, class LO, class GO, class Node>
151 : dist_object_type(Teuchos::rcp(new map_type()))
152 , // nonnull, so DistObject doesn't throw
153 graph_(Teuchos::rcp(new map_type()), 0)
154 , // FIXME (mfh 16 May 2014) no empty ctor yet
155 blockSize_(static_cast<LO>(0))
156 , X_colMap_(new Teuchos::RCP<BMV>())
157 , // ptr to a null ptr
158 Y_rowMap_(new Teuchos::RCP<BMV>())
159 , // ptr to a null ptr
160 pointImporter_(new Teuchos::RCP<typename crs_graph_type::import_type>())
161 , offsetPerBlock_(0)
162 , localError_(new bool(false))
163 , errs_(new Teuchos::RCP<std::ostringstream>()) // ptr to a null ptr
164{
165}
166
167template <class Scalar, class LO, class GO, class Node>
170 const LO blockSize)
171 : dist_object_type(graph.getMap())
172 , graph_(graph)
173 , rowMeshMap_(*(graph.getRowMap()))
174 , blockSize_(blockSize)
175 , X_colMap_(new Teuchos::RCP<BMV>())
176 , // ptr to a null ptr
177 Y_rowMap_(new Teuchos::RCP<BMV>())
178 , // ptr to a null ptr
179 pointImporter_(new Teuchos::RCP<typename crs_graph_type::import_type>())
180 , offsetPerBlock_(blockSize * blockSize)
181 , localError_(new bool(false))
182 , errs_(new Teuchos::RCP<std::ostringstream>()) // ptr to a null ptr
183{
186 !graph_.isSorted(), std::invalid_argument,
187 "Tpetra::"
188 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
189 "rows (isSorted() is false). This class assumes sorted rows.");
190
191 graphRCP_ = Teuchos::rcpFromRef(graph_);
192
193 // Trick to test whether LO is nonpositive, without a compiler
194 // warning in case LO is unsigned (which is generally a bad idea
195 // anyway). I don't promise that the trick works, but it
196 // generally does with gcc at least, in my experience.
197 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
199 blockSizeIsNonpositive, std::invalid_argument,
200 "Tpetra::"
201 "BlockCrsMatrix constructor: The input blockSize = "
202 << blockSize << " <= 0. The block size must be positive.");
203
204 domainPointMap_ = BMV::makePointMap(*(graph.getDomainMap()), blockSize);
205 rangePointMap_ = BMV::makePointMap(*(graph.getRangeMap()), blockSize);
206
207 {
208 // These are rcp
209 const auto domainPointMap = getDomainMap();
210 const auto colPointMap = Teuchos::rcp(new typename BMV::map_type(BMV::makePointMap(*graph_.getColMap(), blockSize_)));
211 *pointImporter_ = Teuchos::rcp(new typename crs_graph_type::import_type(domainPointMap, colPointMap));
212 }
213 {
214 auto local_graph_h = graph.getLocalGraphHost();
215 auto ptr_h = local_graph_h.row_map;
216 ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
217 Kokkos::deep_copy(ptrHost_, ptr_h);
218
219 auto ind_h = local_graph_h.entries;
220 indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
221 // DEEP_COPY REVIEW - HOST-TO-HOST
222 Kokkos::deep_copy(indHost_, ind_h);
223
224 const auto numValEnt = ind_h.extent(0) * offsetPerBlock();
225 val_ = decltype(val_)(impl_scalar_type_dualview("val", numValEnt));
226 }
227}
228
229template <class Scalar, class LO, class GO, class Node>
232 const typename local_matrix_device_type::values_type& values,
233 const LO blockSize)
234 : dist_object_type(graph.getMap())
235 , graph_(graph)
236 , rowMeshMap_(*(graph.getRowMap()))
237 , blockSize_(blockSize)
238 , X_colMap_(new Teuchos::RCP<BMV>())
239 , // ptr to a null ptr
240 Y_rowMap_(new Teuchos::RCP<BMV>())
241 , // ptr to a null ptr
242 pointImporter_(new Teuchos::RCP<typename crs_graph_type::import_type>())
243 , offsetPerBlock_(blockSize * blockSize)
244 , localError_(new bool(false))
245 , errs_(new Teuchos::RCP<std::ostringstream>()) // ptr to a null ptr
246{
249 !graph_.isSorted(), std::invalid_argument,
250 "Tpetra::"
251 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
252 "rows (isSorted() is false). This class assumes sorted rows.");
253
254 graphRCP_ = Teuchos::rcpFromRef(graph_);
255
256 // Trick to test whether LO is nonpositive, without a compiler
257 // warning in case LO is unsigned (which is generally a bad idea
258 // anyway). I don't promise that the trick works, but it
259 // generally does with gcc at least, in my experience.
260 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
262 blockSizeIsNonpositive, std::invalid_argument,
263 "Tpetra::"
264 "BlockCrsMatrix constructor: The input blockSize = "
265 << blockSize << " <= 0. The block size must be positive.");
266
267 domainPointMap_ = BMV::makePointMap(*(graph.getDomainMap()), blockSize);
268 rangePointMap_ = BMV::makePointMap(*(graph.getRangeMap()), blockSize);
269
270 {
271 // These are rcp
272 const auto domainPointMap = getDomainMap();
273 const auto colPointMap = Teuchos::rcp(new typename BMV::map_type(BMV::makePointMap(*graph_.getColMap(), blockSize_)));
274 *pointImporter_ = Teuchos::rcp(new typename crs_graph_type::import_type(domainPointMap, colPointMap));
275 }
276 {
277 auto local_graph_h = graph.getLocalGraphHost();
278 auto ptr_h = local_graph_h.row_map;
279 ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
280 Kokkos::deep_copy(ptrHost_, ptr_h);
281
282 auto ind_h = local_graph_h.entries;
283 indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
284 Kokkos::deep_copy(indHost_, ind_h);
285
286 const auto numValEnt = ind_h.extent(0) * offsetPerBlock();
288 val_ = decltype(val_)(values);
289 }
290}
291
292template <class Scalar, class LO, class GO, class Node>
296 const map_type& rangePointMap,
297 const LO blockSize)
298 : dist_object_type(graph.getMap())
299 , graph_(graph)
300 , rowMeshMap_(*(graph.getRowMap()))
301 , domainPointMap_(domainPointMap)
302 , rangePointMap_(rangePointMap)
303 , blockSize_(blockSize)
304 , X_colMap_(new Teuchos::RCP<BMV>())
305 , // ptr to a null ptr
306 Y_rowMap_(new Teuchos::RCP<BMV>())
307 , // ptr to a null ptr
308 pointImporter_(new Teuchos::RCP<typename crs_graph_type::import_type>())
309 , offsetPerBlock_(blockSize * blockSize)
310 , localError_(new bool(false))
311 , errs_(new Teuchos::RCP<std::ostringstream>()) // ptr to a null ptr
312{
314 !graph_.isSorted(), std::invalid_argument,
315 "Tpetra::"
316 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
317 "rows (isSorted() is false). This class assumes sorted rows.");
318
319 graphRCP_ = Teuchos::rcpFromRef(graph_);
320
321 // Trick to test whether LO is nonpositive, without a compiler
322 // warning in case LO is unsigned (which is generally a bad idea
323 // anyway). I don't promise that the trick works, but it
324 // generally does with gcc at least, in my experience.
325 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
327 blockSizeIsNonpositive, std::invalid_argument,
328 "Tpetra::"
329 "BlockCrsMatrix constructor: The input blockSize = "
330 << blockSize << " <= 0. The block size must be positive.");
331 {
332 // These are rcp
333 const auto rcpDomainPointMap = getDomainMap();
334 const auto colPointMap = Teuchos::rcp(new typename BMV::map_type(BMV::makePointMap(*graph_.getColMap(), blockSize_)));
335 *pointImporter_ = Teuchos::rcp(new typename crs_graph_type::import_type(rcpDomainPointMap, colPointMap));
336 }
337 {
338 auto local_graph_h = graph.getLocalGraphHost();
339 auto ptr_h = local_graph_h.row_map;
340 ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
341 // DEEP_COPY REVIEW - HOST-TO-HOST
342 Kokkos::deep_copy(ptrHost_, ptr_h);
343
344 auto ind_h = local_graph_h.entries;
345 indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
346 // DEEP_COPY REVIEW - HOST-TO-HOST
347 Kokkos::deep_copy(indHost_, ind_h);
348
349 const auto numValEnt = ind_h.extent(0) * offsetPerBlock();
350 val_ = decltype(val_)(impl_scalar_type_dualview("val", numValEnt));
351 }
352}
353
354template <class Scalar, class LO, class GO, class Node>
355Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
357 getDomainMap() const { // Copy constructor of map_type does a shallow copy.
358 // We're only returning by RCP for backwards compatibility.
359 return Teuchos::rcp(new map_type(domainPointMap_));
360}
361
362template <class Scalar, class LO, class GO, class Node>
363Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
365 getRangeMap() const { // Copy constructor of map_type does a shallow copy.
366 // We're only returning by RCP for backwards compatibility.
367 return Teuchos::rcp(new map_type(rangePointMap_));
368}
369
370template <class Scalar, class LO, class GO, class Node>
371Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
373 getRowMap() const {
374 return graph_.getRowMap();
375}
376
377template <class Scalar, class LO, class GO, class Node>
378Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
380 getColMap() const {
381 return graph_.getColMap();
382}
383
384template <class Scalar, class LO, class GO, class Node>
387 getGlobalNumRows() const {
388 return graph_.getGlobalNumRows();
389}
390
391template <class Scalar, class LO, class GO, class Node>
392size_t
394 getLocalNumRows() const {
395 return graph_.getLocalNumRows();
396}
397
398template <class Scalar, class LO, class GO, class Node>
399size_t
402 return graph_.getLocalMaxNumRowEntries();
403}
404
405template <class Scalar, class LO, class GO, class Node>
407 apply(const mv_type& X,
408 mv_type& Y,
409 Teuchos::ETransp mode,
411 Scalar beta) const {
414 mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
415 std::invalid_argument,
416 "Tpetra::BlockCrsMatrix::apply: "
417 "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
418 "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
419
421 !X.isConstantStride() || !Y.isConstantStride(),
422 std::invalid_argument,
423 "Tpetra::BlockCrsMatrix::apply: "
424 "X and Y must both be constant stride");
425
426 BMV X_view;
427 BMV Y_view;
428 const LO blockSize = getBlockSize();
429 try {
430 X_view = BMV(X, *(graph_.getDomainMap()), blockSize);
431 Y_view = BMV(Y, *(graph_.getRangeMap()), blockSize);
432 } catch (std::invalid_argument& e) {
434 true, std::invalid_argument,
435 "Tpetra::BlockCrsMatrix::"
436 "apply: Either the input MultiVector X or the output MultiVector Y "
437 "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
438 "graph. BlockMultiVector's constructor threw the following exception: "
439 << e.what());
440 }
441
442 try {
443 // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
444 // either that or mark fields of this class as 'mutable'. The
445 // problem is that applyBlock wants to do lazy initialization of
446 // temporary block multivectors.
447 const_cast<this_BCRS_type&>(*this).applyBlock(X_view, Y_view, mode, alpha, beta);
448 } catch (std::invalid_argument& e) {
450 true, std::invalid_argument,
451 "Tpetra::BlockCrsMatrix::"
452 "apply: The implementation method applyBlock complained about having "
453 "an invalid argument. It reported the following: "
454 << e.what());
455 } catch (std::logic_error& e) {
457 true, std::invalid_argument,
458 "Tpetra::BlockCrsMatrix::"
459 "apply: The implementation method applyBlock complained about a "
460 "possible bug in its implementation. It reported the following: "
461 << e.what());
462 } catch (std::exception& e) {
464 true, std::invalid_argument,
465 "Tpetra::BlockCrsMatrix::"
466 "apply: The implementation method applyBlock threw an exception which "
467 "is neither std::invalid_argument nor std::logic_error, but is a "
468 "subclass of std::exception. It reported the following: "
469 << e.what());
470 } catch (...) {
472 true, std::logic_error,
473 "Tpetra::BlockCrsMatrix::"
474 "apply: The implementation method applyBlock threw an exception which "
475 "is not an instance of a subclass of std::exception. This probably "
476 "indicates a bug. Please report this to the Tpetra developers.");
477 }
478}
479
480template <class Scalar, class LO, class GO, class Node>
484 Teuchos::ETransp mode,
485 const Scalar alpha,
486 const Scalar beta) {
488 X.getBlockSize() != Y.getBlockSize(), std::invalid_argument,
489 "Tpetra::BlockCrsMatrix::applyBlock: "
490 "X and Y have different block sizes. X.getBlockSize() = "
491 << X.getBlockSize() << " != Y.getBlockSize() = "
492 << Y.getBlockSize() << ".");
493
494 if (mode == Teuchos::NO_TRANS) {
495 applyBlockNoTrans(X, Y, alpha, beta);
496 } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
497 applyBlockTrans(X, Y, mode, alpha, beta);
498 } else {
500 true, std::invalid_argument,
501 "Tpetra::BlockCrsMatrix::"
502 "applyBlock: Invalid 'mode' argument. Valid values are "
503 "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
504 }
505}
506
507template <class Scalar, class LO, class GO, class Node>
510 const Import<LO, GO, Node>& importer) const {
511 using Teuchos::RCP;
512 using Teuchos::rcp;
514
515 // Right now, we make many assumptions...
516 TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
517 "destMatrix is required to be null.");
518
519 // BlockCrsMatrix requires a complete graph at construction.
520 // So first step is to import and fill complete the destGraph.
521 RCP<crs_graph_type> srcGraph = rcp(new crs_graph_type(this->getCrsGraph()));
523 srcGraph->getDomainMap(),
524 srcGraph->getRangeMap());
525
526 // Final step, create and import the destMatrix.
527 destMatrix = rcp(new this_BCRS_type(*destGraph, getBlockSize()));
528 destMatrix->doImport(*this, importer, Tpetra::INSERT);
529}
530
531template <class Scalar, class LO, class GO, class Node>
534 const Export<LO, GO, Node>& exporter) const {
535 using Teuchos::RCP;
536 using Teuchos::rcp;
538
539 // Right now, we make many assumptions...
540 TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
541 "destMatrix is required to be null.");
542
543 // BlockCrsMatrix requires a complete graph at construction.
544 // So first step is to import and fill complete the destGraph.
545 RCP<crs_graph_type> srcGraph = rcp(new crs_graph_type(this->getCrsGraph()));
547 srcGraph->getDomainMap(),
548 srcGraph->getRangeMap());
549
550 // Final step, create and import the destMatrix.
551 destMatrix = rcp(new this_BCRS_type(*destGraph, getBlockSize()));
552 destMatrix->doExport(*this, exporter, Tpetra::INSERT);
553}
554
555template <class Scalar, class LO, class GO, class Node>
558 auto val_d = val_.getDeviceView(Access::OverwriteAll);
559 Kokkos::deep_copy(val_d, alpha);
560}
561
562template <class Scalar, class LO, class GO, class Node>
565 const LO colInds[],
566 const Scalar vals[],
567 const LO numColInds) const {
568 std::vector<ptrdiff_t> offsets(numColInds);
569 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
570 const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
571 return validCount;
572}
573
574template <class Scalar, class LO, class GO, class Node>
576 getLocalDiagOffsets(const Kokkos::View<size_t*, device_type,
577 Kokkos::MemoryUnmanaged>& offsets) const {
578 graph_.getLocalDiagOffsets(offsets);
579}
580
581template <class Scalar, class LO, class GO, class Node>
583 getLocalDiagCopy(const Kokkos::View<impl_scalar_type***, device_type,
584 Kokkos::MemoryUnmanaged>& diag,
585 const Kokkos::View<const size_t*, device_type,
586 Kokkos::MemoryUnmanaged>& offsets) const {
587 using Kokkos::parallel_for;
588 const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
589
590 const LO lclNumMeshRows = static_cast<LO>(rowMeshMap_.getLocalNumElements());
591 const LO blockSize = this->getBlockSize();
592 TEUCHOS_TEST_FOR_EXCEPTION(static_cast<LO>(diag.extent(0)) < lclNumMeshRows ||
593 static_cast<LO>(diag.extent(1)) < blockSize ||
594 static_cast<LO>(diag.extent(2)) < blockSize,
595 std::invalid_argument, prefix << "The input Kokkos::View is not big enough to hold all the data.");
596 TEUCHOS_TEST_FOR_EXCEPTION(static_cast<LO>(offsets.size()) < lclNumMeshRows, std::invalid_argument,
597 prefix << "offsets.size() = " << offsets.size() << " < local number of "
598 "diagonal blocks "
599 << lclNumMeshRows << ".");
600
601 typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
603
604 // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
605 // we reserve the right to do lazy allocation of device data. (We
606 // don't plan to do lazy allocation for host data; the host
607 // version of the data always exists.)
608 auto val_d = val_.getDeviceView(Access::ReadOnly);
609 parallel_for(policy_type(0, lclNumMeshRows),
610 functor_type(diag, val_d, offsets,
611 graph_.getLocalGraphDevice().row_map, blockSize_));
612}
613
614template <class Scalar, class LO, class GO, class Node>
617 const LO colInds[],
618 const Scalar vals[],
619 const LO numColInds) const {
620 std::vector<ptrdiff_t> offsets(numColInds);
621 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
622 const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
623 return validCount;
624}
625
626template <class Scalar, class LO, class GO, class Node>
629 const LO colInds[],
630 const Scalar vals[],
631 const LO numColInds) const {
632 std::vector<ptrdiff_t> offsets(numColInds);
633 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
634 const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
635 return validCount;
636}
637template <class Scalar, class LO, class GO, class Node>
640 nonconst_local_inds_host_view_type& Indices,
641 nonconst_values_host_view_type& Values,
642 size_t& NumEntries) const {
643 auto vals = getValuesHost(LocalRow);
644 const LO numInds = ptrHost_(LocalRow + 1) - ptrHost_(LocalRow);
645 if (numInds > (LO)Indices.extent(0) || numInds * blockSize_ * blockSize_ > (LO)Values.extent(0)) {
646 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
647 "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
648 << numInds << " row entries");
649 }
650 const LO* colInds = indHost_.data() + ptrHost_(LocalRow);
651 for (LO i = 0; i < numInds; ++i) {
652 Indices[i] = colInds[i];
653 }
654 for (LO i = 0; i < numInds * blockSize_ * blockSize_; ++i) {
655 Values[i] = vals[i];
656 }
658}
659
660template <class Scalar, class LO, class GO, class Node>
663 ptrdiff_t offsets[],
664 const LO colInds[],
665 const LO numColInds) const {
666 if (!rowMeshMap_.isNodeLocalElement(localRowInd)) {
667 // We got no offsets, because the input local row index is
668 // invalid on the calling process. That may not be an error, if
669 // numColInds is zero anyway; it doesn't matter. This is the
670 // advantage of returning the number of valid indices.
671 return static_cast<LO>(0);
672 }
673
674 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid();
675 LO hint = 0; // Guess for the relative offset into the current row
676 LO validCount = 0; // number of valid column indices in colInds
677
678 for (LO k = 0; k < numColInds; ++k) {
679 const LO relBlockOffset =
680 this->findRelOffsetOfColumnIndex(localRowInd, colInds[k], hint);
681 if (relBlockOffset != LINV) {
682 offsets[k] = static_cast<ptrdiff_t>(relBlockOffset);
683 hint = relBlockOffset + 1;
684 ++validCount;
685 }
686 }
687 return validCount;
688}
689
690template <class Scalar, class LO, class GO, class Node>
693 const ptrdiff_t offsets[],
694 const Scalar vals[],
695 const LO numOffsets) const {
696 if (!rowMeshMap_.isNodeLocalElement(localRowInd)) {
697 // We modified no values, because the input local row index is
698 // invalid on the calling process. That may not be an error, if
699 // numColInds is zero anyway; it doesn't matter. This is the
700 // advantage of returning the number of valid indices.
701 return static_cast<LO>(0);
702 }
703 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*>(vals);
705 auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
706 impl_scalar_type* vOut = val_out.data();
707
708 const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
709 const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid();
710 size_t pointOffset = 0; // Current offset into input values
711 LO validCount = 0; // number of valid offsets
712
713 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
714 const size_t blockOffset = offsets[k] * perBlockSize;
715 if (offsets[k] != STINV) {
716 little_block_type A_old = getNonConstLocalBlockFromInput(vOut, blockOffset);
717 const_little_block_type A_new = getConstLocalBlockFromInput(vIn, pointOffset);
718 COPY(A_new, A_old);
719 ++validCount;
720 }
721 }
722 return validCount;
723}
724
725template <class Scalar, class LO, class GO, class Node>
728 const ptrdiff_t offsets[],
729 const Scalar vals[],
730 const LO numOffsets) const {
731 if (!rowMeshMap_.isNodeLocalElement(localRowInd)) {
732 // We modified no values, because the input local row index is
733 // invalid on the calling process. That may not be an error, if
734 // numColInds is zero anyway; it doesn't matter. This is the
735 // advantage of returning the number of valid indices.
736 return static_cast<LO>(0);
737 }
738 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*>(vals);
739 auto val_out = getValuesHost(localRowInd);
740 impl_scalar_type* vOut = const_cast<impl_scalar_type*>(val_out.data());
741
742 const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
743 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid();
744 size_t pointOffset = 0; // Current offset into input values
745 LO validCount = 0; // number of valid offsets
746
747 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
748 const size_t blockOffset = offsets[k] * perBlockSize;
749 if (blockOffset != STINV) {
750 little_block_type A_old = getNonConstLocalBlockFromInput(vOut, blockOffset);
751 const_little_block_type A_new = getConstLocalBlockFromInput(vIn, pointOffset);
752 ::Tpetra::Impl::absMax(A_old, A_new);
753 ++validCount;
754 }
755 }
756 return validCount;
757}
758
759template <class Scalar, class LO, class GO, class Node>
762 const ptrdiff_t offsets[],
763 const Scalar vals[],
764 const LO numOffsets) const {
765 if (!rowMeshMap_.isNodeLocalElement(localRowInd)) {
766 // We modified no values, because the input local row index is
767 // invalid on the calling process. That may not be an error, if
768 // numColInds is zero anyway; it doesn't matter. This is the
769 // advantage of returning the number of valid indices.
770 return static_cast<LO>(0);
771 }
772 const impl_scalar_type ONE = static_cast<impl_scalar_type>(1.0);
773 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*>(vals);
775 auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
776 impl_scalar_type* vOut = val_out.data();
777
778 const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
779 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid();
780 size_t pointOffset = 0; // Current offset into input values
781 LO validCount = 0; // number of valid offsets
782
783 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
784 const size_t blockOffset = offsets[k] * perBlockSize;
785 if (blockOffset != STINV) {
786 little_block_type A_old = getNonConstLocalBlockFromInput(vOut, blockOffset);
787 const_little_block_type A_new = getConstLocalBlockFromInput(vIn, pointOffset);
788 AXPY(ONE, A_new, A_old);
789 ++validCount;
790 }
791 }
792 return validCount;
793}
794
795template <class Scalar, class LO, class GO, class Node>
797 impl_scalar_type_dualview::t_host::const_type
799 getValuesHost() const {
800 return val_.getHostView(Access::ReadOnly);
801}
802
803template <class Scalar, class LO, class GO, class Node>
804typename BlockCrsMatrix<Scalar, LO, GO, Node>::
805 impl_scalar_type_dualview::t_dev::const_type
806 BlockCrsMatrix<Scalar, LO, GO, Node>::
807 getValuesDevice() const {
808 return val_.getDeviceView(Access::ReadOnly);
809}
810
811template <class Scalar, class LO, class GO, class Node>
812typename BlockCrsMatrix<Scalar, LO, GO, Node>::
813 impl_scalar_type_dualview::t_host
816 return val_.getHostView(Access::ReadWrite);
817}
818
819template <class Scalar, class LO, class GO, class Node>
821 impl_scalar_type_dualview::t_dev
824 return val_.getDeviceView(Access::ReadWrite);
825}
826
827template <class Scalar, class LO, class GO, class Node>
828typename BlockCrsMatrix<Scalar, LO, GO, Node>::
829 impl_scalar_type_dualview::t_host::const_type
831 getValuesHost(const LO& lclRow) const {
832 const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
833 auto val = val_.getHostView(Access::ReadOnly);
834 auto r_val = Kokkos::subview(val, Kokkos::pair<LO, LO>(ptrHost_(lclRow) * perBlockSize, ptrHost_(lclRow + 1) * perBlockSize));
835 return r_val;
836}
837
838template <class Scalar, class LO, class GO, class Node>
840 impl_scalar_type_dualview::t_dev::const_type
842 getValuesDevice(const LO& lclRow) const {
843 const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
844 auto val = val_.getDeviceView(Access::ReadOnly);
845 auto r_val = Kokkos::subview(val, Kokkos::pair<LO, LO>(ptrHost_(lclRow) * perBlockSize, ptrHost_(lclRow + 1) * perBlockSize));
846 return r_val;
847}
848
849template <class Scalar, class LO, class GO, class Node>
853 const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
854 auto val = val_.getHostView(Access::ReadWrite);
855 auto r_val = Kokkos::subview(val, Kokkos::pair<LO, LO>(ptrHost_(lclRow) * perBlockSize, ptrHost_(lclRow + 1) * perBlockSize));
856 return r_val;
857}
858
859template <class Scalar, class LO, class GO, class Node>
863 const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
864 auto val = val_.getDeviceView(Access::ReadWrite);
865 auto r_val = Kokkos::subview(val, Kokkos::pair<LO, LO>(ptrHost_(lclRow) * perBlockSize, ptrHost_(lclRow + 1) * perBlockSize));
866 return r_val;
867}
868
869template <class Scalar, class LO, class GO, class Node>
870size_t
873 const size_t numEntInGraph = graph_.getNumEntriesInLocalRow(localRowInd);
874 if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid()) {
875 return static_cast<LO>(0); // the calling process doesn't have that row
876 } else {
877 return static_cast<LO>(numEntInGraph);
878 }
879}
880
881template <class Scalar, class LO, class GO, class Node>
882typename BlockCrsMatrix<Scalar, LO, GO, Node>::local_matrix_device_type
884 getLocalMatrixDevice() const {
885 auto numCols = this->graph_.getColMap()->getLocalNumElements();
886 auto val = val_.getDeviceView(Access::ReadWrite);
887 const LO blockSize = this->getBlockSize();
888 const auto graph = this->graph_.getLocalGraphDevice();
889
890 return local_matrix_device_type("Tpetra::BlockCrsMatrix::lclMatrixDevice",
891 numCols,
892 val,
893 graph,
894 blockSize);
895}
896
897template <class Scalar, class LO, class GO, class Node>
901 const Teuchos::ETransp mode,
902 const Scalar alpha,
903 const Scalar beta) {
904 (void)X;
905 (void)Y;
906 (void)mode;
907 (void)alpha;
908 (void)beta;
909
911 true, std::logic_error,
912 "Tpetra::BlockCrsMatrix::apply: "
913 "transpose and conjugate transpose modes are not yet implemented.");
914}
915
916template <class Scalar, class LO, class GO, class Node>
917void BlockCrsMatrix<Scalar, LO, GO, Node>::
918 applyBlockNoTrans(const BlockMultiVector<Scalar, LO, GO, Node>& X,
919 BlockMultiVector<Scalar, LO, GO, Node>& Y,
920 const Scalar alpha,
921 const Scalar beta) {
922 using Teuchos::RCP;
923 using Teuchos::rcp;
924 typedef ::Tpetra::Import<LO, GO, Node> import_type;
925 typedef ::Tpetra::Export<LO, GO, Node> export_type;
926 const Scalar zero = STS::zero();
927 const Scalar one = STS::one();
928 RCP<const import_type> import = graph_.getImporter();
929 // "export" is a reserved C++ keyword, so we can't use it.
930 RCP<const export_type> theExport = graph_.getExporter();
931 const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
932
933 if (alpha == zero) {
934 if (beta == zero) {
935 Y.putScalar(zero); // replace Inf or NaN (BLAS rules)
936 } else if (beta != one) {
937 Y.scale(beta);
938 }
939 } else { // alpha != 0
940 const BMV* X_colMap = NULL;
941 if (import.is_null()) {
942 try {
943 X_colMap = &X;
944 } catch (std::exception& e) {
945 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, prefix << "Tpetra::MultiVector::"
946 "operator= threw an exception: "
947 << std::endl
948 << e.what());
949 }
950 } else {
951 // X_colMap_ is a pointer to a pointer to BMV. Ditto for
952 // Y_rowMap_ below. This lets us do lazy initialization
953 // correctly with view semantics of BlockCrsMatrix. All views
954 // of this BlockCrsMatrix have the same outer pointer. That
955 // way, we can set the inner pointer in one view, and all
956 // other views will see it.
957 if ((*X_colMap_).is_null() ||
958 (**X_colMap_).getNumVectors() != X.getNumVectors() ||
959 (**X_colMap_).getBlockSize() != X.getBlockSize()) {
960 *X_colMap_ = rcp(new BMV(*(graph_.getColMap()), getBlockSize(),
961 static_cast<LO>(X.getNumVectors())));
962 }
963 (*X_colMap_)->getMultiVectorView().doImport(X.getMultiVectorView(), **pointImporter_, ::Tpetra::REPLACE);
964 try {
965 X_colMap = &(**X_colMap_);
966 } catch (std::exception& e) {
967 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, prefix << "Tpetra::MultiVector::"
968 "operator= threw an exception: "
969 << std::endl
970 << e.what());
971 }
972 }
973
974 BMV* Y_rowMap = NULL;
975 if (theExport.is_null()) {
976 Y_rowMap = &Y;
977 } else if ((*Y_rowMap_).is_null() ||
978 (**Y_rowMap_).getNumVectors() != Y.getNumVectors() ||
979 (**Y_rowMap_).getBlockSize() != Y.getBlockSize()) {
980 *Y_rowMap_ = rcp(new BMV(*(graph_.getRowMap()), getBlockSize(),
981 static_cast<LO>(X.getNumVectors())));
982 try {
983 Y_rowMap = &(**Y_rowMap_);
984 } catch (std::exception& e) {
985 TEUCHOS_TEST_FOR_EXCEPTION(
986 true, std::logic_error, prefix << "Tpetra::MultiVector::"
987 "operator= threw an exception: "
988 << std::endl
989 << e.what());
990 }
991 }
992
993 try {
994 localApplyBlockNoTrans(*X_colMap, *Y_rowMap, alpha, beta);
995 } catch (std::exception& e) {
996 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
997 "an exception: "
998 << e.what());
999 } catch (...) {
1000 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1001 "an exception not a subclass of std::exception.");
1002 }
1003
1004 if (!theExport.is_null()) {
1005 Y.doExport(*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1006 }
1007 }
1008}
1009
1010template <class Scalar, class LO, class GO, class Node>
1011void BlockCrsMatrix<Scalar, LO, GO, Node>::localApplyBlockNoTrans(
1012 const BlockMultiVector<Scalar, LO, GO, Node>& X,
1013 BlockMultiVector<Scalar, LO, GO, Node>& Y, const Scalar alpha,
1014 const Scalar beta) {
1016 "Tpetra::BlockCrsMatrix::localApplyBlockNoTrans");
1017 const impl_scalar_type alpha_impl = alpha;
1018 const auto graph = this->graph_.getLocalGraphDevice();
1019
1020 mv_type X_mv = X.getMultiVectorView();
1021 mv_type Y_mv = Y.getMultiVectorView();
1022 auto X_lcl = X_mv.getLocalViewDevice(Access::ReadOnly);
1023 auto Y_lcl = Y_mv.getLocalViewDevice(Access::ReadWrite);
1024
1025 auto A_lcl = getLocalMatrixDevice();
1026 if (!applyHelper.get()) {
1027 // The apply helper does not exist, so create it
1028 applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map);
1029 }
1030 if (applyHelper->shouldUseIntRowptrs()) {
1031 auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
1032 KokkosSparse::spmv(
1033 &applyHelper->handle_int, KokkosSparse::NoTranspose,
1034 alpha_impl, A_lcl_int_rowptrs, X_lcl, beta, Y_lcl);
1035 } else {
1036 KokkosSparse::spmv(
1037 &applyHelper->handle, KokkosSparse::NoTranspose,
1038 alpha_impl, A_lcl, X_lcl, beta, Y_lcl);
1039 }
1040}
1041
1042template <class Scalar, class LO, class GO, class Node>
1043LO BlockCrsMatrix<Scalar, LO, GO, Node>::
1044 findRelOffsetOfColumnIndex(const LO localRowIndex,
1045 const LO colIndexToFind,
1046 const LO hint) const {
1047 const size_t absStartOffset = ptrHost_[localRowIndex];
1048 const size_t absEndOffset = ptrHost_[localRowIndex + 1];
1049 const LO numEntriesInRow = static_cast<LO>(absEndOffset - absStartOffset);
1050 // Amortize pointer arithmetic over the search loop.
1051 const LO* const curInd = indHost_.data() + absStartOffset;
1052
1053 // If the hint was correct, then the hint is the offset to return.
1054 if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1055 // Always return the offset relative to the current row.
1056 return hint;
1057 }
1058
1059 // The hint was wrong, so we must search for the given column
1060 // index in the column indices for the given row.
1061 LO relOffset = Teuchos::OrdinalTraits<LO>::invalid();
1062
1063 // We require that the graph have sorted rows. However, binary
1064 // search only pays if the current row is longer than a certain
1065 // amount. We set this to 32, but you might want to tune this.
1066 const LO maxNumEntriesForLinearSearch = 32;
1067 if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1068 // Use binary search. It would probably be better for us to
1069 // roll this loop by hand. If we wrote it right, a smart
1070 // compiler could perhaps use conditional loads and avoid
1071 // branches (according to Jed Brown on May 2014).
1072 const LO* beg = curInd;
1073 const LO* end = curInd + numEntriesInRow;
1074 std::pair<const LO*, const LO*> p =
1075 std::equal_range(beg, end, colIndexToFind);
1076 if (p.first != p.second) {
1077 // offset is relative to the current row
1078 relOffset = static_cast<LO>(p.first - beg);
1079 }
1080 } else { // use linear search
1081 for (LO k = 0; k < numEntriesInRow; ++k) {
1082 if (colIndexToFind == curInd[k]) {
1083 relOffset = k; // offset is relative to the current row
1084 break;
1085 }
1086 }
1087 }
1088
1089 return relOffset;
1090}
1091
1092template <class Scalar, class LO, class GO, class Node>
1093LO BlockCrsMatrix<Scalar, LO, GO, Node>::
1094 offsetPerBlock() const {
1095 return offsetPerBlock_;
1096}
1097
1098template <class Scalar, class LO, class GO, class Node>
1100BlockCrsMatrix<Scalar, LO, GO, Node>::
1101 getConstLocalBlockFromInput(const impl_scalar_type* val,
1102 const size_t pointOffset) const {
1103 // Row major blocks
1104 const LO rowStride = blockSize_;
1105 return const_little_block_type(val + pointOffset, blockSize_, rowStride);
1106}
1107
1108template <class Scalar, class LO, class GO, class Node>
1110BlockCrsMatrix<Scalar, LO, GO, Node>::
1111 getNonConstLocalBlockFromInput(impl_scalar_type* val,
1112 const size_t pointOffset) const {
1113 // Row major blocks
1114 const LO rowStride = blockSize_;
1115 return little_block_type(val + pointOffset, blockSize_, rowStride);
1116}
1117
1118template <class Scalar, class LO, class GO, class Node>
1119typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1120BlockCrsMatrix<Scalar, LO, GO, Node>::
1121 getNonConstLocalBlockFromInputHost(impl_scalar_type* val,
1122 const size_t pointOffset) const {
1123 // Row major blocks
1124 const LO rowStride = blockSize_;
1125 const size_t bs2 = blockSize_ * blockSize_;
1126 return little_block_host_type(val + bs2 * pointOffset, blockSize_, rowStride);
1127}
1128
1129template <class Scalar, class LO, class GO, class Node>
1131BlockCrsMatrix<Scalar, LO, GO, Node>::
1132 getLocalBlockDeviceNonConst(const LO localRowInd, const LO localColInd) const {
1133 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1134
1135 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1136 const LO relBlockOffset = this->findRelOffsetOfColumnIndex(localRowInd, localColInd);
1137 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid()) {
1138 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1139 auto vals = const_cast<this_BCRS_type&>(*this).getValuesDeviceNonConst();
1140 auto r_val = getNonConstLocalBlockFromInput(vals.data(), absBlockOffset);
1141 return r_val;
1142 } else {
1143 return little_block_type();
1144 }
1145}
1146
1147template <class Scalar, class LO, class GO, class Node>
1148typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1149BlockCrsMatrix<Scalar, LO, GO, Node>::
1150 getLocalBlockHostNonConst(const LO localRowInd, const LO localColInd) const {
1151 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1152
1153 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1154 const LO relBlockOffset = this->findRelOffsetOfColumnIndex(localRowInd, localColInd);
1155 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid()) {
1156 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1157 auto vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst();
1158 auto r_val = getNonConstLocalBlockFromInputHost(vals.data(), absBlockOffset);
1159 return r_val;
1160 } else {
1161 return little_block_host_type();
1162 }
1163}
1164
1165template <class Scalar, class LO, class GO, class Node>
1166bool BlockCrsMatrix<Scalar, LO, GO, Node>::
1167 checkSizes(const ::Tpetra::SrcDistObject& source) {
1168 using std::endl;
1169 typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1170 const this_BCRS_type* src = dynamic_cast<const this_BCRS_type*>(&source);
1171
1172 if (src == NULL) {
1173 std::ostream& err = markLocalErrorAndGetStream();
1174 err << "checkSizes: The source object of the Import or Export "
1175 "must be a BlockCrsMatrix with the same template parameters as the "
1176 "target object."
1177 << endl;
1178 } else {
1179 // Use a string of ifs, not if-elseifs, because we want to know
1180 // all the errors.
1181 if (src->getBlockSize() != this->getBlockSize()) {
1182 std::ostream& err = markLocalErrorAndGetStream();
1183 err << "checkSizes: The source and target objects of the Import or "
1184 << "Export must have the same block sizes. The source's block "
1185 << "size = " << src->getBlockSize() << " != the target's block "
1186 << "size = " << this->getBlockSize() << "." << endl;
1187 }
1188 if (!src->graph_.isFillComplete()) {
1189 std::ostream& err = markLocalErrorAndGetStream();
1190 err << "checkSizes: The source object of the Import or Export is "
1191 "not fill complete. Both source and target objects must be fill "
1192 "complete."
1193 << endl;
1194 }
1195 if (!this->graph_.isFillComplete()) {
1196 std::ostream& err = markLocalErrorAndGetStream();
1197 err << "checkSizes: The target object of the Import or Export is "
1198 "not fill complete. Both source and target objects must be fill "
1199 "complete."
1200 << endl;
1201 }
1202 if (src->graph_.getColMap().is_null()) {
1203 std::ostream& err = markLocalErrorAndGetStream();
1204 err << "checkSizes: The source object of the Import or Export does "
1205 "not have a column Map. Both source and target objects must have "
1206 "column Maps."
1207 << endl;
1208 }
1209 if (this->graph_.getColMap().is_null()) {
1210 std::ostream& err = markLocalErrorAndGetStream();
1211 err << "checkSizes: The target object of the Import or Export does "
1212 "not have a column Map. Both source and target objects must have "
1213 "column Maps."
1214 << endl;
1215 }
1216 }
1217
1218 return !(*(this->localError_));
1219}
1220
1221template <class Scalar, class LO, class GO, class Node>
1223 copyAndPermute(const ::Tpetra::SrcDistObject& source,
1224 const size_t numSameIDs,
1225 const Kokkos::DualView<const local_ordinal_type*,
1227 const Kokkos::DualView<const local_ordinal_type*,
1229 const CombineMode /*CM*/) {
1230 using std::endl;
1231 using ::Tpetra::Details::Behavior;
1232 using ::Tpetra::Details::dualViewStatusToString;
1233 using ::Tpetra::Details::ProfilingRegion;
1235
1236 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
1237 const bool debug = Behavior::debug();
1238 const bool verbose = Behavior::verbose();
1239
1240 // Define this function prefix
1241 std::string prefix;
1242 {
1243 std::ostringstream os;
1244 const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1245 os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
1246 prefix = os.str();
1247 }
1248
1249 // check if this already includes a local error
1250 if (*(this->localError_)) {
1251 std::ostream& err = this->markLocalErrorAndGetStream();
1252 err << prefix
1253 << "The target object of the Import or Export is already in an error state."
1254 << endl;
1255 return;
1256 }
1257
1258 //
1259 // Verbose input dual view status
1260 //
1261 if (verbose) {
1262 std::ostringstream os;
1263 os << prefix << endl
1264 << prefix << " " << dualViewStatusToString(permuteToLIDs, "permuteToLIDs") << endl
1265 << prefix << " " << dualViewStatusToString(permuteFromLIDs, "permuteFromLIDs") << endl;
1266 std::cerr << os.str();
1267 }
1268
1272 if (permuteToLIDs.extent(0) != permuteFromLIDs.extent(0)) {
1273 std::ostream& err = this->markLocalErrorAndGetStream();
1274 err << prefix
1275 << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent(0)
1276 << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1277 << "." << endl;
1278 return;
1279 }
1280 if (permuteToLIDs.need_sync_host() || permuteFromLIDs.need_sync_host()) {
1281 std::ostream& err = this->markLocalErrorAndGetStream();
1282 err << prefix
1283 << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1284 << endl;
1285 return;
1286 }
1287
1288 const this_BCRS_type* src = dynamic_cast<const this_BCRS_type*>(&source);
1289 if (src == NULL) {
1290 std::ostream& err = this->markLocalErrorAndGetStream();
1291 err << prefix
1292 << "The source (input) object of the Import or "
1293 "Export is either not a BlockCrsMatrix, or does not have the right "
1294 "template parameters. checkSizes() should have caught this. "
1295 "Please report this bug to the Tpetra developers."
1296 << endl;
1297 return;
1298 }
1299
1300 bool lclErr = false;
1301#ifdef HAVE_TPETRA_DEBUG
1302 std::set<LO> invalidSrcCopyRows;
1303 std::set<LO> invalidDstCopyRows;
1304 std::set<LO> invalidDstCopyCols;
1305 std::set<LO> invalidDstPermuteCols;
1306 std::set<LO> invalidPermuteFromRows;
1307#endif // HAVE_TPETRA_DEBUG
1308
1309 // Copy the initial sequence of rows that are the same.
1310 //
1311 // The two graphs might have different column Maps, so we need to
1312 // do this using global column indices. This is purely local, so
1313 // we only need to check for local sameness of the two column
1314 // Maps.
1315
1316#ifdef HAVE_TPETRA_DEBUG
1317 const map_type& srcRowMap = *(src->graph_.getRowMap());
1318#endif // HAVE_TPETRA_DEBUG
1319 const map_type& dstRowMap = *(this->graph_.getRowMap());
1320 const map_type& srcColMap = *(src->graph_.getColMap());
1321 const map_type& dstColMap = *(this->graph_.getColMap());
1322 const bool canUseLocalColumnIndices = srcColMap.locallySameAs(dstColMap);
1323
1324 const size_t numPermute = static_cast<size_t>(permuteFromLIDs.extent(0));
1325 if (verbose) {
1326 std::ostringstream os;
1327 os << prefix
1328 << "canUseLocalColumnIndices: "
1329 << (canUseLocalColumnIndices ? "true" : "false")
1330 << ", numPermute: " << numPermute
1331 << endl;
1332 std::cerr << os.str();
1333 }
1334
1335 const auto permuteToLIDsHost = permuteToLIDs.view_host();
1336 const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1337
1339 // Copy local rows that are the "same" in both source and target.
1340 for (LO localRow = 0; localRow < static_cast<LO>(numSameIDs); ++localRow) {
1341#ifdef HAVE_TPETRA_DEBUG
1342 if (!srcRowMap.isNodeLocalElement(localRow)) {
1343 lclErr = true;
1344 invalidSrcCopyRows.insert(localRow);
1345 continue; // skip invalid rows
1346 }
1347#endif // HAVE_TPETRA_DEBUG
1348
1349 local_inds_host_view_type lclSrcCols;
1350 values_host_view_type vals;
1351 LO numEntries;
1352 // If this call fails, that means the mesh row local index is
1353 // invalid. That means the Import or Export is invalid somehow.
1354 src->getLocalRowView(localRow, lclSrcCols, vals);
1355 numEntries = lclSrcCols.extent(0);
1356 if (numEntries > 0) {
1357 LO err = this->replaceLocalValues(localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1358 if (err != numEntries) {
1359 lclErr = true;
1360 if (!dstRowMap.isNodeLocalElement(localRow)) {
1361#ifdef HAVE_TPETRA_DEBUG
1362 invalidDstCopyRows.insert(localRow);
1363#endif // HAVE_TPETRA_DEBUG
1364 } else {
1365 // Once there's an error, there's no sense in saving
1366 // time, so we check whether the column indices were
1367 // invalid. However, only remember which ones were
1368 // invalid in a debug build, because that might take a
1369 // lot of space.
1370 for (LO k = 0; k < numEntries; ++k) {
1371 if (!dstColMap.isNodeLocalElement(lclSrcCols[k])) {
1372 lclErr = true;
1373#ifdef HAVE_TPETRA_DEBUG
1375#endif // HAVE_TPETRA_DEBUG
1376 }
1377 }
1378 }
1379 }
1380 }
1381 } // for each "same" local row
1382
1383 // Copy the "permute" local rows.
1384 for (size_t k = 0; k < numPermute; ++k) {
1385 const LO srcLclRow = static_cast<LO>(permuteFromLIDsHost(k));
1386 const LO dstLclRow = static_cast<LO>(permuteToLIDsHost(k));
1387
1388 local_inds_host_view_type lclSrcCols;
1389 values_host_view_type vals;
1390 LO numEntries;
1391 src->getLocalRowView(srcLclRow, lclSrcCols, vals);
1392 numEntries = lclSrcCols.extent(0);
1393 if (numEntries > 0) {
1394 LO err = this->replaceLocalValues(dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1395 if (err != numEntries) {
1396 lclErr = true;
1397#ifdef HAVE_TPETRA_DEBUG
1398 for (LO c = 0; c < numEntries; ++c) {
1399 if (!dstColMap.isNodeLocalElement(lclSrcCols[c])) {
1401 }
1402 }
1403#endif // HAVE_TPETRA_DEBUG
1404 }
1405 }
1406 }
1407 } else { // must convert column indices to global
1408 // Reserve space to store the destination matrix's local column indices.
1409 const size_t maxNumEnt = src->graph_.getLocalMaxNumRowEntries();
1410 Teuchos::Array<LO> lclDstCols(maxNumEnt);
1411
1412 // Copy local rows that are the "same" in both source and target.
1413 for (LO localRow = 0; localRow < static_cast<LO>(numSameIDs); ++localRow) {
1414 local_inds_host_view_type lclSrcCols;
1415 values_host_view_type vals;
1416 LO numEntries;
1417
1418 // If this call fails, that means the mesh row local index is
1419 // invalid. That means the Import or Export is invalid somehow.
1420 try {
1421 src->getLocalRowView(localRow, lclSrcCols, vals);
1422 numEntries = lclSrcCols.extent(0);
1423 } catch (std::exception& e) {
1424 if (debug) {
1425 std::ostringstream os;
1426 const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1427 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1428 << localRow << ", src->getLocalRowView() threw an exception: "
1429 << e.what();
1430 std::cerr << os.str();
1431 }
1432 throw e;
1433 }
1434
1435 if (numEntries > 0) {
1436 if (static_cast<size_t>(numEntries) > static_cast<size_t>(lclDstCols.size())) {
1437 lclErr = true;
1438 if (debug) {
1439 std::ostringstream os;
1440 const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1441 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1442 << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
1443 << maxNumEnt << endl;
1444 std::cerr << os.str();
1445 }
1446 } else {
1447 // Convert the source matrix's local column indices to the
1448 // destination matrix's local column indices.
1449 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view(0, numEntries);
1450 for (LO j = 0; j < numEntries; ++j) {
1451 lclDstColsView[j] = dstColMap.getLocalElement(srcColMap.getGlobalElement(lclSrcCols[j]));
1452 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid()) {
1453 lclErr = true;
1454#ifdef HAVE_TPETRA_DEBUG
1456#endif // HAVE_TPETRA_DEBUG
1457 }
1458 }
1459 LO err(0);
1460 try {
1461 err = this->replaceLocalValues(localRow, lclDstColsView.getRawPtr(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1462 } catch (std::exception& e) {
1463 if (debug) {
1464 std::ostringstream os;
1465 const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1466 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1467 << localRow << ", this->replaceLocalValues() threw an exception: "
1468 << e.what();
1469 std::cerr << os.str();
1470 }
1471 throw e;
1472 }
1473 if (err != numEntries) {
1474 lclErr = true;
1475 if (debug) {
1476 std::ostringstream os;
1477 const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1478 os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
1479 "localRow "
1480 << localRow << ", this->replaceLocalValues "
1481 "returned "
1482 << err << " instead of numEntries = "
1483 << numEntries << endl;
1484 std::cerr << os.str();
1485 }
1486 }
1487 }
1488 }
1489 }
1490
1491 // Copy the "permute" local rows.
1492 for (size_t k = 0; k < numPermute; ++k) {
1493 const LO srcLclRow = static_cast<LO>(permuteFromLIDsHost(k));
1494 const LO dstLclRow = static_cast<LO>(permuteToLIDsHost(k));
1495
1496 local_inds_host_view_type lclSrcCols;
1497 values_host_view_type vals;
1498 LO numEntries;
1499
1500 try {
1501 src->getLocalRowView(srcLclRow, lclSrcCols, vals);
1502 numEntries = lclSrcCols.extent(0);
1503 } catch (std::exception& e) {
1504 if (debug) {
1505 std::ostringstream os;
1506 const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1507 os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
1508 "srcLclRow "
1509 << srcLclRow << " and dstLclRow " << dstLclRow
1510 << ", src->getLocalRowView() threw an exception: " << e.what();
1511 std::cerr << os.str();
1512 }
1513 throw e;
1514 }
1515
1516 if (numEntries > 0) {
1517 if (static_cast<size_t>(numEntries) > static_cast<size_t>(lclDstCols.size())) {
1518 lclErr = true;
1519 } else {
1520 // Convert the source matrix's local column indices to the
1521 // destination matrix's local column indices.
1522 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view(0, numEntries);
1523 for (LO j = 0; j < numEntries; ++j) {
1524 lclDstColsView[j] = dstColMap.getLocalElement(srcColMap.getGlobalElement(lclSrcCols[j]));
1525 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid()) {
1526 lclErr = true;
1527#ifdef HAVE_TPETRA_DEBUG
1529#endif // HAVE_TPETRA_DEBUG
1530 }
1531 }
1532 LO err = this->replaceLocalValues(dstLclRow, lclDstColsView.getRawPtr(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1533 if (err != numEntries) {
1534 lclErr = true;
1535 }
1536 }
1537 }
1538 }
1539 }
1540
1541 if (lclErr) {
1542 std::ostream& err = this->markLocalErrorAndGetStream();
1543#ifdef HAVE_TPETRA_DEBUG
1544 err << "copyAndPermute: The graph structure of the source of the "
1545 "Import or Export must be a subset of the graph structure of the "
1546 "target. ";
1547 err << "invalidSrcCopyRows = [";
1548 for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin();
1549 it != invalidSrcCopyRows.end(); ++it) {
1550 err << *it;
1551 typename std::set<LO>::const_iterator itp1 = it;
1552 itp1++;
1553 if (itp1 != invalidSrcCopyRows.end()) {
1554 err << ",";
1555 }
1556 }
1557 err << "], invalidDstCopyRows = [";
1558 for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin();
1559 it != invalidDstCopyRows.end(); ++it) {
1560 err << *it;
1561 typename std::set<LO>::const_iterator itp1 = it;
1562 itp1++;
1563 if (itp1 != invalidDstCopyRows.end()) {
1564 err << ",";
1565 }
1566 }
1567 err << "], invalidDstCopyCols = [";
1568 for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin();
1569 it != invalidDstCopyCols.end(); ++it) {
1570 err << *it;
1571 typename std::set<LO>::const_iterator itp1 = it;
1572 itp1++;
1573 if (itp1 != invalidDstCopyCols.end()) {
1574 err << ",";
1575 }
1576 }
1577 err << "], invalidDstPermuteCols = [";
1578 for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin();
1579 it != invalidDstPermuteCols.end(); ++it) {
1580 err << *it;
1581 typename std::set<LO>::const_iterator itp1 = it;
1582 itp1++;
1583 if (itp1 != invalidDstPermuteCols.end()) {
1584 err << ",";
1585 }
1586 }
1587 err << "], invalidPermuteFromRows = [";
1588 for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin();
1589 it != invalidPermuteFromRows.end(); ++it) {
1590 err << *it;
1591 typename std::set<LO>::const_iterator itp1 = it;
1592 itp1++;
1593 if (itp1 != invalidPermuteFromRows.end()) {
1594 err << ",";
1595 }
1596 }
1597 err << "]" << endl;
1598#else
1599 err << "copyAndPermute: The graph structure of the source of the "
1600 "Import or Export must be a subset of the graph structure of the "
1601 "target."
1602 << endl;
1603#endif // HAVE_TPETRA_DEBUG
1604 }
1605
1606 if (debug) {
1607 std::ostringstream os;
1608 const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1609 const bool lclSuccess = !(*(this->localError_));
1610 os << "*** Proc " << myRank << ": copyAndPermute "
1611 << (lclSuccess ? "succeeded" : "FAILED");
1612 if (lclSuccess) {
1613 os << endl;
1614 } else {
1615 os << ": error messages: " << this->errorMessages(); // comes w/ endl
1616 }
1617 std::cerr << os.str();
1618 }
1619}
1620
1621namespace { // (anonymous)
1622
1641template <class LO, class GO>
1642size_t
1643packRowCount(const size_t numEnt,
1644 const size_t numBytesPerValue,
1645 const size_t blkSize) {
1646 using ::Tpetra::Details::PackTraits;
1647
1648 if (numEnt == 0) {
1649 // Empty rows always take zero bytes, to ensure sparsity.
1650 return 0;
1651 } else {
1652 // We store the number of entries as a local index (LO).
1653 LO numEntLO = 0; // packValueCount wants this.
1654 GO gid{};
1655 const size_t numEntLen = PackTraits<LO>::packValueCount(numEntLO);
1656 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount(gid);
1657 const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
1658 return numEntLen + gidsLen + valsLen;
1659 }
1660}
1661
1672template <class ST, class LO, class GO>
1673size_t
1674unpackRowCount(const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
1675 const size_t offset,
1676 const size_t numBytes,
1677 const size_t /* numBytesPerValue */) {
1678 using Kokkos::subview;
1679 using ::Tpetra::Details::PackTraits;
1680
1681 if (numBytes == 0) {
1682 // Empty rows always take zero bytes, to ensure sparsity.
1683 return static_cast<size_t>(0);
1684 } else {
1685 LO numEntLO = 0;
1686 const size_t theNumBytes = PackTraits<LO>::packValueCount(numEntLO);
1687 TEUCHOS_TEST_FOR_EXCEPTION(theNumBytes > numBytes, std::logic_error,
1688 "unpackRowCount: "
1689 "theNumBytes = "
1690 << theNumBytes << " < numBytes = " << numBytes
1691 << ".");
1692 const char* const inBuf = imports.data() + offset;
1693 const size_t actualNumBytes = PackTraits<LO>::unpackValue(numEntLO, inBuf);
1694 TEUCHOS_TEST_FOR_EXCEPTION(actualNumBytes > numBytes, std::logic_error,
1695 "unpackRowCount: "
1696 "actualNumBytes = "
1697 << actualNumBytes << " < numBytes = " << numBytes
1698 << ".");
1699 return static_cast<size_t>(numEntLO);
1700 }
1701}
1702
1706template <class ST, class LO, class GO>
1707size_t
1708packRowForBlockCrs(const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
1709 const size_t offset,
1710 const size_t numEnt,
1711 const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
1712 const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
1713 const size_t numBytesPerValue,
1714 const size_t blockSize) {
1715 using Kokkos::subview;
1716 using ::Tpetra::Details::PackTraits;
1717
1718 if (numEnt == 0) {
1719 // Empty rows always take zero bytes, to ensure sparsity.
1720 return 0;
1721 }
1722 const size_t numScalarEnt = numEnt * blockSize * blockSize;
1723
1724 const GO gid = 0; // packValueCount wants this
1725 const LO numEntLO = static_cast<size_t>(numEnt);
1726
1727 const size_t numEntBeg = offset;
1728 const size_t numEntLen = PackTraits<LO>::packValueCount(numEntLO);
1729 const size_t gidsBeg = numEntBeg + numEntLen;
1730 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount(gid);
1731 const size_t valsBeg = gidsBeg + gidsLen;
1732 const size_t valsLen = numScalarEnt * numBytesPerValue;
1733
1734 char* const numEntOut = exports.data() + numEntBeg;
1735 char* const gidsOut = exports.data() + gidsBeg;
1736 char* const valsOut = exports.data() + valsBeg;
1737
1738 size_t numBytesOut = 0;
1739 int errorCode = 0;
1740 numBytesOut += PackTraits<LO>::packValue(numEntOut, numEntLO);
1741
1742 {
1743 Kokkos::pair<int, size_t> p;
1744 p = PackTraits<GO>::packArray(gidsOut, gidsIn.data(), numEnt);
1745 errorCode += p.first;
1746 numBytesOut += p.second;
1747
1748 p = PackTraits<ST>::packArray(valsOut, valsIn.data(), numScalarEnt);
1749 errorCode += p.first;
1750 numBytesOut += p.second;
1751 }
1752
1753 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1754 TEUCHOS_TEST_FOR_EXCEPTION(numBytesOut != expectedNumBytes, std::logic_error,
1755 "packRowForBlockCrs: numBytesOut = " << numBytesOut
1756 << " != expectedNumBytes = " << expectedNumBytes << ".");
1757
1758 TEUCHOS_TEST_FOR_EXCEPTION(errorCode != 0, std::runtime_error,
1759 "packRowForBlockCrs: "
1760 "PackTraits::packArray returned a nonzero error code");
1761
1762 return numBytesOut;
1763}
1764
1765// Return the number of bytes actually read / used.
1766template <class ST, class LO, class GO>
1767size_t
1768unpackRowForBlockCrs(const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
1769 const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
1770 const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
1771 const size_t offset,
1772 const size_t numBytes,
1773 const size_t numEnt,
1774 const size_t numBytesPerValue,
1775 const size_t blockSize) {
1776 using ::Tpetra::Details::PackTraits;
1777
1778 if (numBytes == 0) {
1779 // Rows with zero bytes always have zero entries.
1780 return 0;
1781 }
1782 const size_t numScalarEnt = numEnt * blockSize * blockSize;
1783 TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(imports.extent(0)) <= offset,
1784 std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = " << imports.extent(0) << " <= offset = " << offset << ".");
1785 TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(imports.extent(0)) < offset + numBytes,
1786 std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = " << imports.extent(0) << " < offset + numBytes = " << (offset + numBytes) << ".");
1787
1788 const GO gid = 0; // packValueCount wants this
1789 const LO lid = 0; // packValueCount wants this
1790
1791 const size_t numEntBeg = offset;
1792 const size_t numEntLen = PackTraits<LO>::packValueCount(lid);
1793 const size_t gidsBeg = numEntBeg + numEntLen;
1794 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount(gid);
1795 const size_t valsBeg = gidsBeg + gidsLen;
1796 const size_t valsLen = numScalarEnt * numBytesPerValue;
1797
1798 const char* const numEntIn = imports.data() + numEntBeg;
1799 const char* const gidsIn = imports.data() + gidsBeg;
1800 const char* const valsIn = imports.data() + valsBeg;
1801
1802 size_t numBytesOut = 0;
1803 int errorCode = 0;
1804 LO numEntOut;
1805 numBytesOut += PackTraits<LO>::unpackValue(numEntOut, numEntIn);
1806 TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(numEntOut) != numEnt, std::logic_error,
1807 "unpackRowForBlockCrs: Expected number of entries " << numEnt
1808 << " != actual number of entries " << numEntOut << ".");
1809
1810 {
1811 Kokkos::pair<int, size_t> p;
1812 p = PackTraits<GO>::unpackArray(gidsOut.data(), gidsIn, numEnt);
1813 errorCode += p.first;
1814 numBytesOut += p.second;
1815
1816 p = PackTraits<ST>::unpackArray(valsOut.data(), valsIn, numScalarEnt);
1817 errorCode += p.first;
1818 numBytesOut += p.second;
1819 }
1820
1821 TEUCHOS_TEST_FOR_EXCEPTION(numBytesOut != numBytes, std::logic_error,
1822 "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1823 << " != numBytes = " << numBytes << ".");
1824
1825 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1826 TEUCHOS_TEST_FOR_EXCEPTION(numBytesOut != expectedNumBytes, std::logic_error,
1827 "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1828 << " != expectedNumBytes = " << expectedNumBytes << ".");
1829
1830 TEUCHOS_TEST_FOR_EXCEPTION(errorCode != 0, std::runtime_error,
1831 "unpackRowForBlockCrs: "
1832 "PackTraits::unpackArray returned a nonzero error code");
1833
1834 return numBytesOut;
1835}
1836} // namespace
1837
1838template <class Scalar, class LO, class GO, class Node>
1840 packAndPrepare(const ::Tpetra::SrcDistObject& source,
1841 const Kokkos::DualView<const local_ordinal_type*,
1843 Kokkos::DualView<packet_type*,
1844 buffer_device_type>& exports, // output
1845 Kokkos::DualView<size_t*,
1847 numPacketsPerLID, // output
1848 size_t& constantNumPackets) {
1849 using ::Tpetra::Details::Behavior;
1850 using ::Tpetra::Details::dualViewStatusToString;
1851 using ::Tpetra::Details::PackTraits;
1852 using ::Tpetra::Details::ProfilingRegion;
1853
1854 typedef typename Kokkos::View<int*, device_type>::host_mirror_type::execution_space host_exec;
1855
1857
1858 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
1859
1860 const bool debug = Behavior::debug();
1861 const bool verbose = Behavior::verbose();
1862
1863 // Define this function prefix
1864 std::string prefix;
1865 {
1866 std::ostringstream os;
1867 const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1868 os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
1869 prefix = os.str();
1870 }
1871
1872 // check if this already includes a local error
1873 if (*(this->localError_)) {
1874 std::ostream& err = this->markLocalErrorAndGetStream();
1875 err << prefix
1876 << "The target object of the Import or Export is already in an error state."
1877 << std::endl;
1878 return;
1879 }
1880
1881 //
1882 // Verbose input dual view status
1883 //
1884 if (verbose) {
1885 std::ostringstream os;
1886 os << prefix << std::endl
1887 << prefix << " " << dualViewStatusToString(exportLIDs, "exportLIDs") << std::endl
1888 << prefix << " " << dualViewStatusToString(exports, "exports") << std::endl
1889 << prefix << " " << dualViewStatusToString(numPacketsPerLID, "numPacketsPerLID") << std::endl;
1890 std::cerr << os.str();
1891 }
1892
1896 if (exportLIDs.extent(0) != numPacketsPerLID.extent(0)) {
1897 std::ostream& err = this->markLocalErrorAndGetStream();
1898 err << prefix
1899 << "exportLIDs.extent(0) = " << exportLIDs.extent(0)
1900 << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
1901 << "." << std::endl;
1902 return;
1903 }
1904 if (exportLIDs.need_sync_host()) {
1905 std::ostream& err = this->markLocalErrorAndGetStream();
1906 err << prefix << "exportLIDs be sync'd to host." << std::endl;
1907 return;
1908 }
1909
1910 const this_BCRS_type* src = dynamic_cast<const this_BCRS_type*>(&source);
1911 if (src == NULL) {
1912 std::ostream& err = this->markLocalErrorAndGetStream();
1913 err << prefix
1914 << "The source (input) object of the Import or "
1915 "Export is either not a BlockCrsMatrix, or does not have the right "
1916 "template parameters. checkSizes() should have caught this. "
1917 "Please report this bug to the Tpetra developers."
1918 << std::endl;
1919 return;
1920 }
1921
1922 // Graphs and matrices are allowed to have a variable number of
1923 // entries per row. We could test whether all rows have the same
1924 // number of entries, but DistObject can only use this
1925 // optimization if all rows on _all_ processes have the same
1926 // number of entries. Rather than do the all-reduce necessary to
1927 // test for this unlikely case, we tell DistObject (by setting
1928 // constantNumPackets to zero) to assume that different rows may
1929 // have different numbers of entries.
1931
1932 // const values
1933 const crs_graph_type& srcGraph = src->graph_;
1934 const size_t blockSize = static_cast<size_t>(src->getBlockSize());
1935 const size_t numExportLIDs = exportLIDs.extent(0);
1936 size_t numBytesPerValue(0);
1937 {
1938 auto val_host = val_.getHostView(Access::ReadOnly);
1940 PackTraits<impl_scalar_type>::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
1941 }
1942
1943 // Compute the number of bytes ("packets") per row to pack. While
1944 // we're at it, compute the total # of block entries to send, and
1945 // the max # of block entries in any of the rows we're sending.
1946
1947 Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
1948
1949 // Graph information is on host; let's do this on host parallel reduce
1950 auto exportLIDsHost = exportLIDs.view_host();
1951 auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
1952 numPacketsPerLID.modify_host();
1953 {
1954 rowReducerStruct = Impl::BlockCrsRowStruct<size_t>();
1955 for (size_t i = 0; i < numExportLIDs; ++i) {
1956 const LO lclRow = exportLIDsHost(i);
1957 size_t numEnt = srcGraph.getNumEntriesInLocalRow(lclRow);
1958 numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid() ? 0 : numEnt);
1959
1960 const size_t numBytes =
1963 rowReducerStruct += Impl::BlockCrsRowStruct<size_t>(numEnt, numBytes, numEnt);
1964 }
1965 }
1966
1967 // Compute the number of bytes ("packets") per row to pack. While
1968 // we're at it, compute the total # of block entries to send, and
1969 // the max # of block entries in any of the rows we're sending.
1970 const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
1971 const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
1972 const size_t maxRowLength = rowReducerStruct.maxRowLength;
1973
1974 if (verbose) {
1975 std::ostringstream os;
1976 os << prefix
1977 << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
1978 << std::endl;
1979 std::cerr << os.str();
1980 }
1981
1982 // We use a "struct of arrays" approach to packing each row's
1983 // entries. All the column indices (as global indices) go first,
1984 // then all their owning process ranks, and then the values.
1985 if (exports.extent(0) != totalNumBytes) {
1986 const std::string oldLabel = exports.view_device().label();
1987 const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
1988 exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
1989 }
1990 if (totalNumEntries > 0) {
1991 // Current position (in bytes) in the 'exports' output array.
1992 Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs + 1);
1993 {
1994 const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs + 1);
1995 Kokkos::parallel_scan(policy,
1996 [=](const size_t& i, size_t& update, const bool& final) {
1997 if (final) offset(i) = update;
1998 update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
1999 });
2000 }
2001 if (offset(numExportLIDs) != totalNumBytes) {
2002 std::ostream& err = this->markLocalErrorAndGetStream();
2003 err << prefix
2004 << "At end of method, the final offset (in bytes) "
2005 << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2006 << totalNumBytes << ". "
2007 << "Please report this bug to the Tpetra developers." << std::endl;
2008 return;
2009 }
2010
2011 // For each block row of the matrix owned by the calling
2012 // process, pack that block row's column indices and values into
2013 // the exports array.
2014
2015 // Source matrix's column Map. We verified in checkSizes() that
2016 // the column Map exists (is not null).
2017 const map_type& srcColMap = *(srcGraph.getColMap());
2018
2019 // Pack the data for each row to send, into the 'exports' buffer.
2020 // exports will be modified on host.
2021 exports.modify_host();
2022 {
2023 typedef Kokkos::TeamPolicy<host_exec> policy_type;
2024 const auto policy =
2025 policy_type(numExportLIDs, 1, 1)
2026 .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO) * maxRowLength));
2027 // The following parallel_for needs const access to the local values of src.
2028 // (the local graph is also accessed on host, but this does not use WDVs).
2029 getValuesHost();
2031 Kokkos::parallel_for(policy,
2032 [=](const typename policy_type::member_type& member) {
2033 const size_t i = member.league_rank();
2034 Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2035 gblColInds(member.team_scratch(0), maxRowLength);
2036
2037 const LO lclRowInd = exportLIDsHost(i);
2038 local_inds_host_view_type lclColInds;
2039 values_host_view_type vals;
2040
2041 // It's OK to ignore the return value, since if the calling
2042 // process doesn't own that local row, then the number of
2043 // entries in that row on the calling process is zero.
2044 src->getLocalRowView(lclRowInd, lclColInds, vals);
2045 const size_t numEnt = lclColInds.extent(0);
2046
2047 // Convert column indices from local to global.
2048 for (size_t j = 0; j < numEnt; ++j)
2049 gblColInds(j) = srcColMap.getGlobalElement(lclColInds(j));
2050
2051 // Kyungjoo: additional wrapping scratch view is necessary
2052 // the following function interface need the same execution space
2053 // host scratch space somehow is not considered same as the host_exec
2054 // Copy the row's data into the current spot in the exports array.
2055 const size_t numBytes =
2057 offset(i),
2058 numEnt,
2059 Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2060 vals,
2062 blockSize);
2063
2064 // numBytes should be same as the difference between offsets
2065 if (debug) {
2066 const size_t offsetDiff = offset(i + 1) - offset(i);
2067 if (numBytes != offsetDiff) {
2068 std::ostringstream os;
2069 os << prefix
2070 << "numBytes computed from packRowForBlockCrs is different from "
2071 << "precomputed offset values, LID = " << i << std::endl;
2072 std::cerr << os.str();
2073 }
2074 }
2075 }); // for each LID (of a row) to send
2077 }
2078 } // if totalNumEntries > 0
2079
2080 if (debug) {
2081 std::ostringstream os;
2082 const bool lclSuccess = !(*(this->localError_));
2083 os << prefix
2084 << (lclSuccess ? "succeeded" : "FAILED")
2085 << std::endl;
2086 std::cerr << os.str();
2087 }
2088}
2089
2090template <class Scalar, class LO, class GO, class Node>
2092 unpackAndCombine(const Kokkos::DualView<const local_ordinal_type*,
2094 Kokkos::DualView<packet_type*,
2096 imports,
2097 Kokkos::DualView<size_t*,
2100 const size_t /* constantNumPackets */,
2101 const CombineMode combineMode) {
2102 using std::endl;
2103 using ::Tpetra::Details::Behavior;
2104 using ::Tpetra::Details::dualViewStatusToString;
2105 using ::Tpetra::Details::PackTraits;
2106 using ::Tpetra::Details::ProfilingRegion;
2107 using host_exec =
2108 typename Kokkos::View<int*, device_type>::host_mirror_type::execution_space;
2109
2110 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
2111 const bool verbose = Behavior::verbose();
2112
2113 // Define this function prefix
2114 std::string prefix;
2115 {
2116 std::ostringstream os;
2117 auto map = this->graph_.getRowMap();
2118 auto comm = map.is_null() ? Teuchos::null : map->getComm();
2119 const int myRank = comm.is_null() ? -1 : comm->getRank();
2120 os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2121 prefix = os.str();
2122 if (verbose) {
2123 os << "Start" << endl;
2124 std::cerr << os.str();
2125 }
2126 }
2127
2128 // check if this already includes a local error
2129 if (*(this->localError_)) {
2130 std::ostream& err = this->markLocalErrorAndGetStream();
2131 std::ostringstream os;
2132 os << prefix << "{Im/Ex}port target is already in error." << endl;
2133 if (verbose) {
2134 std::cerr << os.str();
2135 }
2136 err << os.str();
2137 return;
2138 }
2139
2143 if (importLIDs.extent(0) != numPacketsPerLID.extent(0)) {
2144 std::ostream& err = this->markLocalErrorAndGetStream();
2145 std::ostringstream os;
2146 os << prefix << "importLIDs.extent(0) = " << importLIDs.extent(0)
2147 << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2148 << "." << endl;
2149 if (verbose) {
2150 std::cerr << os.str();
2151 }
2152 err << os.str();
2153 return;
2154 }
2155
2156 if (combineMode != ADD && combineMode != INSERT &&
2158 combineMode != ZERO) {
2159 std::ostream& err = this->markLocalErrorAndGetStream();
2160 std::ostringstream os;
2161 os << prefix
2162 << "Invalid CombineMode value " << combineMode << ". Valid "
2163 << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2164 << std::endl;
2165 if (verbose) {
2166 std::cerr << os.str();
2167 }
2168 err << os.str();
2169 return;
2170 }
2171
2172 if (this->graph_.getColMap().is_null()) {
2173 std::ostream& err = this->markLocalErrorAndGetStream();
2174 std::ostringstream os;
2175 os << prefix << "Target matrix's column Map is null." << endl;
2176 if (verbose) {
2177 std::cerr << os.str();
2178 }
2179 err << os.str();
2180 return;
2181 }
2182
2183 // Target matrix's column Map. Use to convert the global column
2184 // indices in the receive buffer to local indices. We verified in
2185 // checkSizes() that the column Map exists (is not null).
2186 const map_type& tgtColMap = *(this->graph_.getColMap());
2187
2188 // Const values
2189 const size_t blockSize = this->getBlockSize();
2190 const size_t numImportLIDs = importLIDs.extent(0);
2191 // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
2192 // default-constructed instance could have a different size than
2193 // other instances. (We assume all nominally constructed
2194 // instances have the same size; that's not the issue here.) This
2195 // could be bad if the calling process has no entries, but other
2196 // processes have entries that they want to send to us.
2197 size_t numBytesPerValue(0);
2198 {
2199 auto val_host = val_.getHostView(Access::ReadOnly);
2201 PackTraits<impl_scalar_type>::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
2202 }
2203 const size_t maxRowNumEnt = graph_.getLocalMaxNumRowEntries();
2205
2206 if (verbose) {
2207 std::ostringstream os;
2208 os << prefix << "combineMode: "
2210 << ", blockSize: " << blockSize
2211 << ", numImportLIDs: " << numImportLIDs
2212 << ", numBytesPerValue: " << numBytesPerValue
2213 << ", maxRowNumEnt: " << maxRowNumEnt
2214 << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2215 std::cerr << os.str();
2216 }
2217
2218 if (combineMode == ZERO || numImportLIDs == 0) {
2219 if (verbose) {
2220 std::ostringstream os;
2221 os << prefix << "Nothing to unpack. Done!" << std::endl;
2222 std::cerr << os.str();
2223 }
2224 return;
2225 }
2226
2227 // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
2228 // we can remove this sync.
2229 if (imports.need_sync_host()) {
2230 imports.sync_host();
2231 }
2232
2233 // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
2234 // sync'd numPacketsPerLID to host, since it needs to do that in
2235 // order to post MPI_Irecv messages with the correct lengths. A
2236 // hypothetical device-specific boundary exchange implementation
2237 // could possibly receive data without sync'ing lengths to host,
2238 // but we don't need to design for that nonexistent thing yet.
2239
2240 if (imports.need_sync_host() || numPacketsPerLID.need_sync_host() ||
2241 importLIDs.need_sync_host()) {
2242 std::ostream& err = this->markLocalErrorAndGetStream();
2243 std::ostringstream os;
2244 os << prefix << "All input DualViews must be sync'd to host by now. "
2245 << "imports_nc.need_sync_host()="
2246 << (imports.need_sync_host() ? "true" : "false")
2247 << ", numPacketsPerLID.need_sync_host()="
2248 << (numPacketsPerLID.need_sync_host() ? "true" : "false")
2249 << ", importLIDs.need_sync_host()="
2250 << (importLIDs.need_sync_host() ? "true" : "false")
2251 << "." << endl;
2252 if (verbose) {
2253 std::cerr << os.str();
2254 }
2255 err << os.str();
2256 return;
2257 }
2258
2259 const auto importLIDsHost = importLIDs.view_host();
2260 const auto numPacketsPerLIDHost = numPacketsPerLID.view_host();
2261
2262 // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
2263 // loop, by only unpacking on final==true (when we know the
2264 // current offset's value).
2265
2266 Kokkos::View<size_t*, host_exec> offset("offset", numImportLIDs + 1);
2267 {
2268 const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs + 1);
2269 Kokkos::parallel_scan("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2270 [=](const size_t& i, size_t& update, const bool& final) {
2271 if (final) offset(i) = update;
2272 update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2273 });
2274 }
2275
2276 // this variable does not matter with a race condition (just error flag)
2277 //
2278 // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
2279 // or atomics on bool, so we use int instead. (I know we're not
2280 // launching a CUDA loop here, but we could in the future, and I'd
2281 // like to avoid potential trouble.)
2282 Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2283 errorDuringUnpack("errorDuringUnpack");
2284 errorDuringUnpack() = 0;
2285 {
2286 using policy_type = Kokkos::TeamPolicy<host_exec>;
2287 size_t scratch_per_row = sizeof(GO) * maxRowNumEnt + sizeof(LO) * maxRowNumEnt + numBytesPerValue * maxRowNumScalarEnt + 2 * sizeof(GO); // Yeah, this is a fudge factor
2288
2289 const auto policy = policy_type(numImportLIDs, 1, 1)
2290 .set_scratch_size(0, Kokkos::PerTeam(scratch_per_row));
2291 using host_scratch_space = typename host_exec::scratch_memory_space;
2292
2293 using pair_type = Kokkos::pair<size_t, size_t>;
2294
2295 // The following parallel_for modifies values on host while unpacking.
2296 getValuesHostNonConst();
2298 Kokkos::parallel_for("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2299 [=, *this](const typename policy_type::member_type& member) {
2300 const size_t i = member.league_rank();
2301 Kokkos::View<GO*, host_scratch_space> gblColInds(member.team_scratch(0), maxRowNumEnt);
2302 Kokkos::View<LO*, host_scratch_space> lclColInds(member.team_scratch(0), maxRowNumEnt);
2303 Kokkos::View<impl_scalar_type*, host_scratch_space> vals(member.team_scratch(0), maxRowNumScalarEnt);
2304
2305 const size_t offval = offset(i);
2306 const LO lclRow = importLIDsHost(i);
2307 const size_t numBytes = numPacketsPerLIDHost(i);
2308 const size_t numEnt =
2310
2311 if (numBytes > 0) {
2312 if (numEnt > maxRowNumEnt) {
2313 errorDuringUnpack() = 1;
2314 if (verbose) {
2315 std::ostringstream os;
2316 os << prefix
2317 << "At i = " << i << ", numEnt = " << numEnt
2318 << " > maxRowNumEnt = " << maxRowNumEnt
2319 << std::endl;
2320 std::cerr << os.str();
2321 }
2322 return;
2323 }
2324 }
2325 const size_t numScalarEnt = numEnt * blockSize * blockSize;
2326 auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2327 auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2328 auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2329
2330 // Kyungjoo: additional wrapping scratch view is necessary
2331 // the following function interface need the same execution space
2332 // host scratch space somehow is not considered same as the host_exec
2333 size_t numBytesOut = 0;
2334 try {
2335 numBytesOut =
2336 unpackRowForBlockCrs<impl_scalar_type, LO, GO>(Kokkos::View<GO*, host_exec>(gidsOut.data(), numEnt),
2337 Kokkos::View<impl_scalar_type*, host_exec>(valsOut.data(), numScalarEnt),
2338 imports.view_host(),
2341 } catch (std::exception& e) {
2342 errorDuringUnpack() = 1;
2343 if (verbose) {
2344 std::ostringstream os;
2345 os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
2346 << e.what() << endl;
2347 std::cerr << os.str();
2348 }
2349 return;
2350 }
2351
2352 if (numBytes != numBytesOut) {
2353 errorDuringUnpack() = 1;
2354 if (verbose) {
2355 std::ostringstream os;
2356 os << prefix
2357 << "At i = " << i << ", numBytes = " << numBytes
2358 << " != numBytesOut = " << numBytesOut << "."
2359 << std::endl;
2360 std::cerr << os.str();
2361 }
2362 return;
2363 }
2364
2365 // Convert incoming global indices to local indices.
2366 for (size_t k = 0; k < numEnt; ++k) {
2367 lidsOut(k) = tgtColMap.getLocalElement(gidsOut(k));
2368 if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid()) {
2369 if (verbose) {
2370 std::ostringstream os;
2371 os << prefix
2372 << "At i = " << i << ", GID " << gidsOut(k)
2373 << " is not owned by the calling process."
2374 << std::endl;
2375 std::cerr << os.str();
2376 }
2377 return;
2378 }
2379 }
2380
2381 // Combine the incoming data with the matrix's current data.
2382 LO numCombd = 0;
2383 const LO* const lidsRaw = const_cast<const LO*>(lidsOut.data());
2384 const Scalar* const valsRaw = reinterpret_cast<const Scalar*>(const_cast<const impl_scalar_type*>(valsOut.data()));
2385 if (combineMode == ADD) {
2386 numCombd = this->sumIntoLocalValues(lclRow, lidsRaw, valsRaw, numEnt);
2387 } else if (combineMode == INSERT || combineMode == REPLACE) {
2388 numCombd = this->replaceLocalValues(lclRow, lidsRaw, valsRaw, numEnt);
2389 } else if (combineMode == ABSMAX) {
2390 numCombd = this->absMaxLocalValues(lclRow, lidsRaw, valsRaw, numEnt);
2391 }
2392
2393 if (static_cast<LO>(numEnt) != numCombd) {
2394 errorDuringUnpack() = 1;
2395 if (verbose) {
2396 std::ostringstream os;
2397 os << prefix << "At i = " << i << ", numEnt = " << numEnt
2398 << " != numCombd = " << numCombd << "."
2399 << endl;
2400 std::cerr << os.str();
2401 }
2402 return;
2403 }
2404 }); // for each import LID i
2406 }
2407
2408 if (errorDuringUnpack() != 0) {
2409 std::ostream& err = this->markLocalErrorAndGetStream();
2410 err << prefix << "Unpacking failed.";
2411 if (!verbose) {
2412 err << " Please run again with the environment variable setting "
2413 "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
2414 }
2415 err << endl;
2416 }
2417
2418 if (verbose) {
2419 std::ostringstream os;
2420 const bool lclSuccess = !(*(this->localError_));
2421 os << prefix
2422 << (lclSuccess ? "succeeded" : "FAILED")
2423 << std::endl;
2424 std::cerr << os.str();
2425 }
2426}
2427
2428template <class Scalar, class LO, class GO, class Node>
2429std::string
2431 using Teuchos::TypeNameTraits;
2432 std::ostringstream os;
2433 os << "\"Tpetra::BlockCrsMatrix\": { "
2434 << "Template parameters: { "
2435 << "Scalar: " << TypeNameTraits<Scalar>::name()
2436 << "LO: " << TypeNameTraits<LO>::name()
2437 << "GO: " << TypeNameTraits<GO>::name()
2438 << "Node: " << TypeNameTraits<Node>::name()
2439 << " }"
2440 << ", Label: \"" << this->getObjectLabel() << "\""
2441 << ", Global dimensions: ["
2442 << graph_.getDomainMap()->getGlobalNumElements() << ", "
2443 << graph_.getRangeMap()->getGlobalNumElements() << "]"
2444 << ", Block size: " << getBlockSize()
2445 << " }";
2446 return os.str();
2447}
2448
2449template <class Scalar, class LO, class GO, class Node>
2451 describe(Teuchos::FancyOStream& out,
2452 const Teuchos::EVerbosityLevel verbLevel) const {
2453 using Teuchos::ArrayRCP;
2454 using Teuchos::CommRequest;
2455 using Teuchos::FancyOStream;
2456 using Teuchos::getFancyOStream;
2457 using Teuchos::ireceive;
2458 using Teuchos::isend;
2459 using Teuchos::outArg;
2460 using Teuchos::TypeNameTraits;
2461 using Teuchos::VERB_DEFAULT;
2462 using Teuchos::VERB_LOW;
2463 using Teuchos::VERB_MEDIUM;
2464 using Teuchos::VERB_NONE;
2465 // using Teuchos::VERB_HIGH;
2466 using std::endl;
2467 using Teuchos::RCP;
2468 using Teuchos::VERB_EXTREME;
2469 using Teuchos::wait;
2470
2471 // Set default verbosity if applicable.
2472 const Teuchos::EVerbosityLevel vl =
2474
2475 if (vl == VERB_NONE) {
2476 return; // print nothing
2477 }
2478
2479 // describe() always starts with a tab before it prints anything.
2480 Teuchos::OSTab tab0(out);
2481
2482 out << "\"Tpetra::BlockCrsMatrix\":" << endl;
2483 Teuchos::OSTab tab1(out);
2484
2485 out << "Template parameters:" << endl;
2486 {
2487 Teuchos::OSTab tab2(out);
2488 out << "Scalar: " << TypeNameTraits<Scalar>::name() << endl
2489 << "LO: " << TypeNameTraits<LO>::name() << endl
2490 << "GO: " << TypeNameTraits<GO>::name() << endl
2491 << "Node: " << TypeNameTraits<Node>::name() << endl;
2492 }
2493 out << "Label: \"" << this->getObjectLabel() << "\"" << endl
2494 << "Global dimensions: ["
2495 << graph_.getDomainMap()->getGlobalNumElements() << ", "
2496 << graph_.getRangeMap()->getGlobalNumElements() << "]" << endl;
2497
2498 const LO blockSize = getBlockSize();
2499 out << "Block size: " << blockSize << endl;
2500
2501 // constituent objects
2502 if (vl >= VERB_MEDIUM) {
2503 const Teuchos::Comm<int>& comm = *(graph_.getMap()->getComm());
2504 const int myRank = comm.getRank();
2505 if (myRank == 0) {
2506 out << "Row Map:" << endl;
2507 }
2508 getRowMap()->describe(out, vl);
2509
2510 if (!getColMap().is_null()) {
2511 if (getColMap() == getRowMap()) {
2512 if (myRank == 0) {
2513 out << "Column Map: same as row Map" << endl;
2514 }
2515 } else {
2516 if (myRank == 0) {
2517 out << "Column Map:" << endl;
2518 }
2519 getColMap()->describe(out, vl);
2520 }
2521 }
2522 if (!getDomainMap().is_null()) {
2523 if (getDomainMap() == getRowMap()) {
2524 if (myRank == 0) {
2525 out << "Domain Map: same as row Map" << endl;
2526 }
2527 } else if (getDomainMap() == getColMap()) {
2528 if (myRank == 0) {
2529 out << "Domain Map: same as column Map" << endl;
2530 }
2531 } else {
2532 if (myRank == 0) {
2533 out << "Domain Map:" << endl;
2534 }
2535 getDomainMap()->describe(out, vl);
2536 }
2537 }
2538 if (!getRangeMap().is_null()) {
2539 if (getRangeMap() == getDomainMap()) {
2540 if (myRank == 0) {
2541 out << "Range Map: same as domain Map" << endl;
2542 }
2543 } else if (getRangeMap() == getRowMap()) {
2544 if (myRank == 0) {
2545 out << "Range Map: same as row Map" << endl;
2546 }
2547 } else {
2548 if (myRank == 0) {
2549 out << "Range Map: " << endl;
2550 }
2551 getRangeMap()->describe(out, vl);
2552 }
2553 }
2554 }
2555
2556 if (vl >= VERB_EXTREME) {
2557 const Teuchos::Comm<int>& comm = *(graph_.getMap()->getComm());
2558 const int myRank = comm.getRank();
2559 const int numProcs = comm.getSize();
2560
2561 // Print the calling process' data to the given output stream.
2562 RCP<std::ostringstream> lclOutStrPtr(new std::ostringstream());
2564 FancyOStream& os = *osPtr;
2565 os << "Process " << myRank << ":" << endl;
2566 Teuchos::OSTab tab2(os);
2567
2568 const map_type& meshRowMap = *(graph_.getRowMap());
2569 const map_type& meshColMap = *(graph_.getColMap());
2570 for (LO meshLclRow = meshRowMap.getMinLocalIndex();
2571 meshLclRow <= meshRowMap.getMaxLocalIndex();
2572 ++meshLclRow) {
2573 const GO meshGblRow = meshRowMap.getGlobalElement(meshLclRow);
2574 os << "Row " << meshGblRow << ": {";
2575
2576 local_inds_host_view_type lclColInds;
2577 values_host_view_type vals;
2578 LO numInds = 0;
2579 this->getLocalRowView(meshLclRow, lclColInds, vals);
2580 numInds = lclColInds.extent(0);
2581
2582 for (LO k = 0; k < numInds; ++k) {
2583 const GO gblCol = meshColMap.getGlobalElement(lclColInds[k]);
2584
2585 os << "Col " << gblCol << ": [";
2586 for (LO i = 0; i < blockSize; ++i) {
2587 for (LO j = 0; j < blockSize; ++j) {
2588 os << vals[blockSize * blockSize * k + i * blockSize + j];
2589 if (j + 1 < blockSize) {
2590 os << ", ";
2591 }
2592 }
2593 if (i + 1 < blockSize) {
2594 os << "; ";
2595 }
2596 }
2597 os << "]";
2598 if (k + 1 < numInds) {
2599 os << ", ";
2600 }
2601 }
2602 os << "}" << endl;
2603 }
2604
2605 // Print data on Process 0. This will automatically respect the
2606 // current indentation level.
2607 if (myRank == 0) {
2608 out << lclOutStrPtr->str();
2609 lclOutStrPtr = Teuchos::null; // clear it to save space
2610 }
2611
2612 const int sizeTag = 1337;
2613 const int dataTag = 1338;
2614
2615 ArrayRCP<char> recvDataBuf; // only used on Process 0
2616
2617 // Send string sizes and data from each process in turn to
2618 // Process 0, and print on that process.
2619 for (int p = 1; p < numProcs; ++p) {
2620 if (myRank == 0) {
2621 // Receive the incoming string's length.
2623 recvSize[0] = 0;
2627 const size_t numCharsToRecv = recvSize[0];
2628
2629 // Allocate space for the string to receive. Reuse receive
2630 // buffer space if possible. We can do this because in the
2631 // current implementation, we only have one receive in
2632 // flight at a time. Leave space for the '\0' at the end,
2633 // in case the sender doesn't send it.
2634 if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
2635 recvDataBuf.resize(numCharsToRecv + 1);
2636 }
2638 // Post the receive of the actual string data.
2642
2643 // Print the received data. This will respect the current
2644 // indentation level. Make sure that the string is
2645 // null-terminated.
2647 out << recvDataBuf.getRawPtr();
2648 } else if (myRank == p) { // if I am not Process 0, and my rank is p
2649 // This deep-copies the string at most twice, depending on
2650 // whether std::string reference counts internally (it
2651 // generally does, so this won't deep-copy at all).
2652 const std::string stringToSend = lclOutStrPtr->str();
2653 lclOutStrPtr = Teuchos::null; // clear original to save space
2654
2655 // Send the string's length to Process 0.
2656 const size_t numCharsToSend = stringToSend.size();
2662
2663 // Send the actual string to Process 0. We know that the
2664 // string has length > 0, so it's save to take the address
2665 // of the first entry. Make a nonowning ArrayRCP to hold
2666 // the string. Process 0 will add a null termination
2667 // character at the end of the string, after it receives the
2668 // message.
2673 }
2674 } // for each process rank p other than 0
2675 } // extreme verbosity level (print the whole matrix)
2676}
2677
2678template <class Scalar, class LO, class GO, class Node>
2679Teuchos::RCP<const Teuchos::Comm<int> >
2681 getComm() const {
2682 return graph_.getComm();
2683}
2684
2685template <class Scalar, class LO, class GO, class Node>
2688 getGlobalNumCols() const {
2689 return graph_.getGlobalNumCols();
2690}
2691
2692template <class Scalar, class LO, class GO, class Node>
2693size_t
2695 getLocalNumCols() const {
2696 return graph_.getLocalNumCols();
2697}
2698
2699template <class Scalar, class LO, class GO, class Node>
2701 getIndexBase() const {
2702 return graph_.getIndexBase();
2703}
2704
2705template <class Scalar, class LO, class GO, class Node>
2708 getGlobalNumEntries() const {
2709 return graph_.getGlobalNumEntries();
2710}
2711
2712template <class Scalar, class LO, class GO, class Node>
2713size_t
2715 getLocalNumEntries() const {
2716 return graph_.getLocalNumEntries();
2717}
2718
2719template <class Scalar, class LO, class GO, class Node>
2720size_t
2723 return graph_.getNumEntriesInGlobalRow(globalRow);
2724}
2725
2726template <class Scalar, class LO, class GO, class Node>
2727size_t
2730 return graph_.getGlobalMaxNumRowEntries();
2731}
2732
2733template <class Scalar, class LO, class GO, class Node>
2735 hasColMap() const {
2736 return graph_.hasColMap();
2737}
2738
2739template <class Scalar, class LO, class GO, class Node>
2741 isLocallyIndexed() const {
2742 return graph_.isLocallyIndexed();
2743}
2744
2745template <class Scalar, class LO, class GO, class Node>
2747 isGloballyIndexed() const {
2748 return graph_.isGloballyIndexed();
2749}
2750
2751template <class Scalar, class LO, class GO, class Node>
2753 isFillComplete() const {
2754 return graph_.isFillComplete();
2755}
2756
2757template <class Scalar, class LO, class GO, class Node>
2759 supportsRowViews() const {
2760 return false;
2761}
2762
2763template <class Scalar, class LO, class GO, class Node>
2765 getGlobalRowCopy(GO /*GlobalRow*/,
2766 nonconst_global_inds_host_view_type& /*Indices*/,
2767 nonconst_values_host_view_type& /*Values*/,
2768 size_t& /*NumEntries*/) const {
2770 true, std::logic_error,
2771 "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
2772 "This class doesn't support global matrix indexing.");
2773}
2774
2775template <class Scalar, class LO, class GO, class Node>
2777 getGlobalRowView(GO /* GlobalRow */,
2778 global_inds_host_view_type& /* indices */,
2779 values_host_view_type& /* values */) const {
2781 true, std::logic_error,
2782 "Tpetra::BlockCrsMatrix::getGlobalRowView: "
2783 "This class doesn't support global matrix indexing.");
2784}
2785
2786template <class Scalar, class LO, class GO, class Node>
2789 local_inds_host_view_type& colInds,
2790 values_host_view_type& vals) const {
2791 if (!rowMeshMap_.isNodeLocalElement(localRowInd)) {
2792 colInds = local_inds_host_view_type();
2793 vals = values_host_view_type();
2794 } else {
2795 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2796 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2797 colInds = local_inds_host_view_type(indHost_.data() + absBlockOffsetStart, numInds);
2798
2799 vals = getValuesHost(localRowInd);
2800 }
2801}
2802
2803template <class Scalar, class LO, class GO, class Node>
2806 local_inds_host_view_type& colInds,
2807 nonconst_values_host_view_type& vals) const {
2808 if (!rowMeshMap_.isNodeLocalElement(localRowInd)) {
2809 colInds = nonconst_local_inds_host_view_type();
2810 vals = nonconst_values_host_view_type();
2811 } else {
2812 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2813 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2814 colInds = local_inds_host_view_type(indHost_.data() + absBlockOffsetStart, numInds);
2815
2817 vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
2818 }
2819}
2820
2821template <class Scalar, class LO, class GO, class Node>
2824 const size_t lclNumMeshRows = graph_.getLocalNumRows();
2825
2826 Kokkos::View<size_t*, device_type> diagOffsets("diagOffsets", lclNumMeshRows);
2827 graph_.getLocalDiagOffsets(diagOffsets);
2828
2829 // The code below works on host, so use a host View.
2830 auto diagOffsetsHost = Kokkos::create_mirror_view(diagOffsets);
2831 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
2832 Kokkos::deep_copy(execution_space(), diagOffsetsHost, diagOffsets);
2833
2834 auto vals_host_out = val_.getHostView(Access::ReadOnly);
2836
2837 // TODO amk: This is a temporary measure to make the code run with Ifpack2
2838 size_t rowOffset = 0;
2839 size_t offset = 0;
2840 LO bs = getBlockSize();
2841 for (size_t r = 0; r < getLocalNumRows(); r++) {
2842 // move pointer to start of diagonal block
2844 for (int b = 0; b < bs; b++) {
2845 diag.replaceLocalValue(r * bs + b, vals_host_out_raw[offset + b * (bs + 1)]);
2846 }
2847 // move pointer to start of next block row
2848 rowOffset += getNumEntriesInLocalRow(r) * bs * bs;
2849 }
2850
2851 // diag.template sync<memory_space> (); // sync vec of diag entries back to dev
2852}
2853
2854template <class Scalar, class LO, class GO, class Node>
2856 leftScale(const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */) {
2858 true, std::logic_error,
2859 "Tpetra::BlockCrsMatrix::leftScale: "
2860 "not implemented.");
2861}
2862
2863template <class Scalar, class LO, class GO, class Node>
2865 rightScale(const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */) {
2867 true, std::logic_error,
2868 "Tpetra::BlockCrsMatrix::rightScale: "
2869 "not implemented.");
2870}
2871
2872template <class Scalar, class LO, class GO, class Node>
2873Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
2875 getGraph() const {
2876 return graphRCP_;
2877}
2878
2879template <class Scalar, class LO, class GO, class Node>
2880typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
2882 getFrobeniusNorm() const {
2884 true, std::logic_error,
2885 "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
2886 "not implemented.");
2887}
2888
2889} // namespace Tpetra
2890
2891//
2892// Explicit instantiation macro
2893//
2894// Must be expanded from within the Tpetra namespace!
2895//
2896#define TPETRA_BLOCKCRSMATRIX_INSTANT(S, LO, GO, NODE) \
2897 template class BlockCrsMatrix<S, LO, GO, NODE>;
2898
2899#endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
Linear algebra kernels for small dense matrices and vectors.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh,...
void exportAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Export< LO, GO, Node > &exporter) const
Import from this to the given destination matrix, and make the result fill complete.
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix's communicator.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
LO local_ordinal_type
The type of local indices.
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row's entries.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
size_t getLocalNumRows() const override
get the local number of block rows
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix's values (val_).
Kokkos::View< impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
local_matrix_device_type getLocalMatrixDevice() const
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
std::string description() const override
One-line description of this object.
void importAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Import< LO, GO, Node > &importer) const
Import from this to the given destination matrix, and make the result fill complete.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh,...
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row.
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
Kokkos::View< const impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > const_little_block_type
The type used to access const matrix blocks.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
char packet_type
Implementation detail; tells.
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
global_size_t getGlobalNumRows() const override
get the global number of block rows
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
MultiVector for multiple degrees of freedom per mesh point.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Struct that holds views of the contents of a CrsMatrix.
One or more distributed dense vectors.
void disableWDVTracking()
Disable WrappedDualView reference-count tracking and syncing. Call this before entering a host-parall...
void enableWDVTracking()
Enable WrappedDualView reference-count tracking and syncing. Call this after exiting a host-parallel ...
Kokkos::LayoutRight BlockCrsMatrixLittleBlockArrayLayout
give an option to use layoutleft
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra's ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
@ ZERO
Replace old values with zero.