Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_MULTIVECTOR_DEF_HPP
11#define TPETRA_MULTIVECTOR_DEF_HPP
12
20
21#include "Tpetra_Core.hpp"
22#include "Tpetra_Util.hpp"
23#include "Tpetra_Vector.hpp"
35#include "Tpetra_Details_Random.hpp"
36#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
37#include "Teuchos_SerialDenseMatrix.hpp"
38#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
39#include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
40#include "KokkosCompat_View.hpp"
41#include "KokkosBlas.hpp"
42#include "KokkosKernels_Utils.hpp"
43#include "Kokkos_Random.hpp"
44#include "KokkosKernels_ArithTraits.hpp"
45#include <memory>
46#include <sstream>
47
48#ifdef HAVE_TPETRA_INST_FLOAT128
49namespace Kokkos {
50// FIXME (mfh 04 Sep 2015) Just a stub for now!
51template <class Generator>
52struct rand<Generator, __float128> {
53 static KOKKOS_INLINE_FUNCTION __float128 max() {
54 return static_cast<__float128>(1.0);
55 }
56 static KOKKOS_INLINE_FUNCTION __float128
57 draw(Generator& gen) {
58 // Half the smallest normalized double, is the scaling factor of
59 // the lower-order term in the double-double representation.
60 const __float128 scalingFactor =
61 static_cast<__float128>(std::numeric_limits<double>::min()) /
62 static_cast<__float128>(2.0);
63 const __float128 higherOrderTerm = static_cast<__float128>(gen.drand());
64 const __float128 lowerOrderTerm =
65 static_cast<__float128>(gen.drand()) * scalingFactor;
66 return higherOrderTerm + lowerOrderTerm;
67 }
68 static KOKKOS_INLINE_FUNCTION __float128
69 draw(Generator& gen, const __float128& range) {
70 // FIXME (mfh 05 Sep 2015) Not sure if this is right.
71 const __float128 scalingFactor =
72 static_cast<__float128>(std::numeric_limits<double>::min()) /
73 static_cast<__float128>(2.0);
74 const __float128 higherOrderTerm =
75 static_cast<__float128>(gen.drand(range));
76 const __float128 lowerOrderTerm =
77 static_cast<__float128>(gen.drand(range)) * scalingFactor;
78 return higherOrderTerm + lowerOrderTerm;
79 }
80 static KOKKOS_INLINE_FUNCTION __float128
81 draw(Generator& gen, const __float128& start, const __float128& end) {
82 // FIXME (mfh 05 Sep 2015) Not sure if this is right.
83 const __float128 scalingFactor =
84 static_cast<__float128>(std::numeric_limits<double>::min()) /
85 static_cast<__float128>(2.0);
86 const __float128 higherOrderTerm =
87 static_cast<__float128>(gen.drand(start, end));
88 const __float128 lowerOrderTerm =
89 static_cast<__float128>(gen.drand(start, end)) * scalingFactor;
90 return higherOrderTerm + lowerOrderTerm;
91 }
92};
93} // namespace Kokkos
94#endif // HAVE_TPETRA_INST_FLOAT128
95
96namespace { // (anonymous)
97
112template <class ST, class LO, class GO, class NT>
114allocDualView(const size_t lclNumRows,
115 const size_t numCols,
116 const bool zeroOut = true,
117 const bool allowPadding = false) {
118 using Kokkos::AllowPadding;
119 using Kokkos::view_alloc;
120 using Kokkos::WithoutInitializing;
121 using ::Tpetra::Details::Behavior;
122 typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
123 typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type wrapped_dual_view_type;
124 typedef typename dual_view_type::t_dev dev_view_type;
125
126 // This needs to be a string and not a char*, if given as an
127 // argument to Kokkos::view_alloc. This is because view_alloc
128 // also allows a raw pointer as its first argument. See
129 // https://github.com/kokkos/kokkos/issues/434.
130 const std::string label("MV::DualView");
131 const bool debug = Behavior::debug();
132
133 // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
134 // creation of the DualView's Views works around
135 // Kokkos::DualView's current inability to accept an
136 // AllocationProperties initial argument (as Kokkos::View does).
137 // However, the work-around is harmless, since it does what the
138 // (currently nonexistent) equivalent DualView constructor would
139 // have done anyway.
140
141 dev_view_type d_view;
142 if (zeroOut) {
143 if (allowPadding) {
144 d_view = dev_view_type(view_alloc(label, AllowPadding),
145 lclNumRows, numCols);
146 } else {
147 d_view = dev_view_type(view_alloc(label),
148 lclNumRows, numCols);
149 }
150 } else {
151 if (allowPadding) {
152 d_view = dev_view_type(view_alloc(label,
153 WithoutInitializing,
154 AllowPadding),
155 lclNumRows, numCols);
156 } else {
157 d_view = dev_view_type(view_alloc(label, WithoutInitializing),
158 lclNumRows, numCols);
159 }
160 if (debug) {
161 // Filling with NaN is a cheap and effective way to tell if
162 // downstream code is trying to use a MultiVector's data
163 // without them having been initialized. ArithTraits lets us
164 // call nan() even if the scalar type doesn't define it; it
165 // just returns some undefined value in the latter case. This
166 // won't hurt anything because by setting zeroOut=false, users
167 // already agreed that they don't care about the contents of
168 // the MultiVector.
169 const ST nan = KokkosKernels::ArithTraits<ST>::nan();
170 KokkosBlas::fill(d_view, nan);
171 }
172 }
173 if (debug) {
174 TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(d_view.extent(0)) != lclNumRows ||
175 static_cast<size_t>(d_view.extent(1)) != numCols,
176 std::logic_error,
177 "allocDualView: d_view's dimensions actual dimensions do not match "
178 "requested dimensions. d_view is "
179 << d_view.extent(0) << " x " << d_view.extent(1) << "; requested " << lclNumRows << " x " << numCols
180 << ". Please report this bug to the Tpetra developers.");
181 }
182
183 return wrapped_dual_view_type(d_view);
184}
185
186// Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
187//
188// T: The type of the entries of the View.
189// ExecSpace: The Kokkos execution space.
190template <class T, class ExecSpace>
191struct MakeUnmanagedView {
192 // The 'false' part of the branch carefully ensures that this
193 // won't attempt to use a host execution space that hasn't been
194 // initialized. For example, if Kokkos::OpenMP is disabled and
195 // Kokkos::Threads is enabled, the latter is always the default
196 // execution space of Kokkos::HostSpace, even when ExecSpace is
197 // Kokkos::Serial. That's why we go through the trouble of asking
198 // Kokkos::DualView what _its_ space is. That seems to work
199 // around this default execution space issue.
200 //
201 typedef typename std::conditional<
202 Kokkos::SpaceAccessibility<
203 typename ExecSpace::memory_space,
204 Kokkos::HostSpace>::accessible,
205 typename ExecSpace::device_type,
206 typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
207 typedef Kokkos::LayoutLeft array_layout;
208 typedef Kokkos::View<T*, array_layout, host_exec_space,
209 Kokkos::MemoryUnmanaged>
210 view_type;
211
212 static view_type getView(const Teuchos::ArrayView<T>& x_in) {
213 const size_t numEnt = static_cast<size_t>(x_in.size());
214 if (numEnt == 0) {
215 return view_type();
216 } else {
217 return view_type(x_in.getRawPtr(), numEnt);
218 }
219 }
220};
221
222template <class WrappedDualViewType>
223WrappedDualViewType
224takeSubview(const WrappedDualViewType& X,
225 const std::pair<size_t, size_t>& rowRng,
226 const Kokkos::ALL_t& colRng)
227
228{
229 // The bug we saw below should be harder to trigger here.
230 return WrappedDualViewType(X, rowRng, colRng);
231}
232
233template <class WrappedDualViewType>
234WrappedDualViewType
235takeSubview(const WrappedDualViewType& X,
236 const Kokkos::ALL_t& rowRng,
237 const std::pair<size_t, size_t>& colRng) {
238 using DualViewType = typename WrappedDualViewType::DVT;
239 // Look carefullly at the comment in the below version of this function.
240 // The comment applies here as well.
241 if (X.extent(0) == 0 && X.extent(1) != 0) {
242 return WrappedDualViewType(DualViewType("MV::DualView", 0, colRng.second - colRng.first));
243 } else {
244 return WrappedDualViewType(X, rowRng, colRng);
245 }
246}
247
248template <class WrappedDualViewType>
249WrappedDualViewType
250takeSubview(const WrappedDualViewType& X,
251 const std::pair<size_t, size_t>& rowRng,
252 const std::pair<size_t, size_t>& colRng) {
253 using DualViewType = typename WrappedDualViewType::DVT;
254 // If you take a subview of a view with zero rows Kokkos::subview()
255 // always returns a DualView with the same data pointers. This will break
256 // pointer equality testing in between two subviews of the same 2D View if
257 // it has zero row extent. While the one (known) case where this was actually used
258 // has been fixed, that sort of check could very easily be reintroduced in the future,
259 // hence I've added this if check here.
260 //
261 // This is not a bug in Kokkos::subview(), just some very subtle behavior which
262 // future developers should be wary of.
263 if (X.extent(0) == 0 && X.extent(1) != 0) {
264 return WrappedDualViewType(DualViewType("MV::DualView", 0, colRng.second - colRng.first));
265 } else {
266 return WrappedDualViewType(X, rowRng, colRng);
267 }
268}
269
270template <class WrappedOrNotDualViewType>
271size_t
272getDualViewStride(const WrappedOrNotDualViewType& dv) {
273 // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
274 // method yet, but its Views do.
275 // NOTE: dv.stride() returns a vector of length one
276 // more than its rank
277 size_t strides[WrappedOrNotDualViewType::t_dev::rank + 1];
278 dv.stride(strides);
279 const size_t LDA = strides[1];
280 const size_t numRows = dv.extent(0);
281
282 if (LDA == 0) {
283 return (numRows == 0) ? size_t(1) : numRows;
284 } else {
285 return LDA;
286 }
287}
288
289template <class ViewType>
290size_t
291getViewStride(const ViewType& view) {
292 const size_t LDA = view.stride(1);
293 const size_t numRows = view.extent(0);
294
295 if (LDA == 0) {
296 return (numRows == 0) ? size_t(1) : numRows;
297 } else {
298 return LDA;
299 }
300}
301
302template <class impl_scalar_type, class buffer_device_type>
303bool runKernelOnHost(
304 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports) {
305 if (!imports.need_sync_device()) {
306 return false; // most up-to-date on device
307 } else { // most up-to-date on host,
308 // but if large enough, worth running on device anyway
309 size_t localLengthThreshold =
311 return imports.extent(0) <= localLengthThreshold;
312 }
313}
314
315template <class SC, class LO, class GO, class NT>
316bool runKernelOnHost(const ::Tpetra::MultiVector<SC, LO, GO, NT>& X) {
317 if (!X.need_sync_device()) {
318 return false; // most up-to-date on device
319 } else { // most up-to-date on host
320 // but if large enough, worth running on device anyway
321 size_t localLengthThreshold =
323 return X.getLocalLength() <= localLengthThreshold;
324 }
325}
326
327template <class SC, class LO, class GO, class NT>
328void multiVectorNormImpl(typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
330 const ::Tpetra::Details::EWhichNorm whichNorm) {
331 using namespace Tpetra;
332 using ::Tpetra::Details::normImpl;
334 using val_type = typename MV::impl_scalar_type;
335 using mag_type = typename MV::mag_type;
336 using dual_view_type = typename MV::dual_view_type;
337
338 auto map = X.getMap();
339 auto comm = map.is_null() ? nullptr : map->getComm().getRawPtr();
340 auto whichVecs = getMultiVectorWhichVectors(X);
341 const bool isConstantStride = X.isConstantStride();
342 const bool isDistributed = X.isDistributed();
343
344 const bool runOnHost = runKernelOnHost(X);
345 if (runOnHost) {
346 using view_type = typename dual_view_type::t_host;
347 using array_layout = typename view_type::array_layout;
348 using device_type = typename view_type::device_type;
349
350 auto X_lcl = X.getLocalViewHost(Access::ReadOnly);
351 normImpl<val_type, array_layout, device_type,
352 mag_type>(norms, X_lcl, whichNorm, whichVecs,
353 isConstantStride, isDistributed, comm);
354 } else {
355 using view_type = typename dual_view_type::t_dev;
356 using array_layout = typename view_type::array_layout;
357 using device_type = typename view_type::device_type;
358
359 auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
360 normImpl<val_type, array_layout, device_type,
361 mag_type>(norms, X_lcl, whichNorm, whichVecs,
362 isConstantStride, isDistributed, comm);
363 }
364}
365} // namespace
366
367namespace Tpetra {
368
369namespace Details {
370template <typename DstView, typename SrcView>
371struct AddAssignFunctor {
372 // This functor would be better as a lambda, but CUDA cannot compile
373 // lambdas in protected functions. It compiles fine with the functor.
374 AddAssignFunctor(DstView& tgt_, SrcView& src_)
375 : tgt(tgt_)
376 , src(src_) {}
377
379 operator()(const size_t k) const { tgt(k) += src(k); }
380
381 DstView tgt;
382 SrcView src;
383};
384} // namespace Details
385
386template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
388 vectorIndexOutOfRange(const size_t VectorIndex) const {
389 return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
390}
391
392template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
396
397template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
399 MultiVector(const Teuchos::RCP<const map_type>& map,
400 const size_t numVecs,
401 const bool zeroOut)
402 : /* default is true */
403 base_type(map) {
404 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,numVecs,zeroOut)");
405
406 const size_t lclNumRows = this->getLocalLength();
408}
409
410template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
413 const Teuchos::DataAccess copyOrView)
415 , view_(source.view_)
416 , whichVectors_(source.whichVectors_) {
418 const char tfecfFuncName[] =
419 "MultiVector(const MultiVector&, "
420 "const Teuchos::DataAccess): ";
421
422 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV 2-arg \"copy\" ctor");
423
424 if (copyOrView == Teuchos::Copy) {
425 // Reuse the conveniently already existing function that creates
426 // a deep copy.
427 MV cpy = createCopy(source);
428 this->view_ = cpy.view_;
429 this->whichVectors_ = cpy.whichVectors_;
430 } else if (copyOrView == Teuchos::View) {
431 } else {
433 true, std::invalid_argument,
434 "Second argument 'copyOrView' has an "
435 "invalid value "
436 << copyOrView << ". Valid values include "
437 "Teuchos::Copy = "
438 << Teuchos::Copy << " and Teuchos::View = "
439 << Teuchos::View << ".");
440 }
441}
442
443template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
445 MultiVector(const Teuchos::RCP<const map_type>& map,
446 const dual_view_type& view)
448 , view_(wrapped_dual_view_type(view)) {
449 const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
450 const size_t lclNumRows_map = map.is_null() ? size_t(0) : map->getLocalNumElements();
451 const size_t lclNumRows_view = view.extent(0);
452 const size_t LDA = getDualViewStride(view_);
453
455 std::invalid_argument,
456 "Kokkos::DualView does not match Map. "
457 "map->getLocalNumElements() = "
459 << ", view.extent(0) = " << lclNumRows_view
460 << ", and getStride() = " << LDA << ".");
462 using ::Tpetra::Details::Behavior;
463 const bool debug = Behavior::debug();
464 if (debug) {
465 using ::Tpetra::Details::checkGlobalDualViewValidity;
466 std::ostringstream gblErrStrm;
467 const bool verbose = Behavior::verbose();
468 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
469 const bool gblValid =
470 checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
471 comm.getRawPtr());
472 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
473 }
474}
475
476template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
478 MultiVector(const Teuchos::RCP<const map_type>& map,
480 : base_type(map)
481 , view_(view) {
482 const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
483 const size_t lclNumRows_map = map.is_null() ? size_t(0) : map->getLocalNumElements();
484 const size_t lclNumRows_view = view.extent(0);
485 const size_t LDA = getDualViewStride(view);
486
488 std::invalid_argument,
489 "Kokkos::DualView does not match Map. "
490 "map->getLocalNumElements() = "
492 << ", view.extent(0) = " << lclNumRows_view
493 << ", and getStride() = " << LDA << ".");
494
495 using ::Tpetra::Details::Behavior;
496 const bool debug = Behavior::debug();
497 if (debug) {
498 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
499 std::ostringstream gblErrStrm;
500 const bool verbose = Behavior::verbose();
501 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
502 const bool gblValid =
503 checkGlobalWrappedDualViewValidity(&gblErrStrm, view, verbose,
504 comm.getRawPtr());
505 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
506 }
507}
508
509template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
511 MultiVector(const Teuchos::RCP<const map_type>& map,
512 const typename dual_view_type::t_dev& d_view)
514 using Teuchos::ArrayRCP;
515 using Teuchos::RCP;
516 const char tfecfFuncName[] = "MultiVector(map,d_view): ";
517
518 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,d_view)");
519
520 const size_t LDA = getViewStride(d_view);
521 const size_t lclNumRows = map->getLocalNumElements();
523 "Map does not match "
524 "Kokkos::View. map->getLocalNumElements() = "
525 << lclNumRows
526 << ", View's column stride = " << LDA
527 << ", and View's extent(0) = " << d_view.extent(0) << ".");
528
529 auto h_view = Kokkos::create_mirror_view(d_view);
532
533 using ::Tpetra::Details::Behavior;
534 const bool debug = Behavior::debug();
535 if (debug) {
536 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
537 std::ostringstream gblErrStrm;
538 const bool verbose = Behavior::verbose();
539 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
540 const bool gblValid =
541 checkGlobalWrappedDualViewValidity(&gblErrStrm, view_, verbose,
542 comm.getRawPtr());
543 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
544 }
545 // The user gave us a device view. In order to respect its
546 // initial contents, we mark the DualView as "modified on device."
547 // That way, the next sync will synchronize it with the host view.
548 dual_view.modify_device();
549}
550
551template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
553 MultiVector(const Teuchos::RCP<const map_type>& map,
554 const dual_view_type& view,
556 : base_type(map)
557 , view_(wrapped_dual_view_type(view, origView)) {
558 const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
559
560 const size_t LDA = getDualViewStride(origView);
561 const size_t lclNumRows = this->getLocalLength(); // comes from the Map
563 LDA < lclNumRows, std::invalid_argument,
564 "The input Kokkos::DualView's "
565 "column stride LDA = "
566 << LDA << " < getLocalLength() = " << lclNumRows
567 << ". This may also mean that the input origView's first dimension (number "
568 "of rows = "
569 << origView.extent(0) << ") does not not match the number "
570 "of entries in the Map on the calling process.");
571
572 using ::Tpetra::Details::Behavior;
573 const bool debug = Behavior::debug();
574 if (debug) {
575 using ::Tpetra::Details::checkGlobalDualViewValidity;
576 std::ostringstream gblErrStrm;
577 const bool verbose = Behavior::verbose();
578 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
579 const bool gblValid_0 =
580 checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
581 comm.getRawPtr());
582 const bool gblValid_1 =
583 checkGlobalDualViewValidity(&gblErrStrm, origView, verbose,
584 comm.getRawPtr());
585 const bool gblValid = gblValid_0 && gblValid_1;
586 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
587 }
588}
589
590template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
592 MultiVector(const Teuchos::RCP<const map_type>& map,
593 const dual_view_type& view,
594 const Teuchos::ArrayView<const size_t>& whichVectors)
595 : base_type(map)
596 , view_(view)
597 , whichVectors_(whichVectors.begin(), whichVectors.end()) {
598 using Kokkos::ALL;
599 using Kokkos::subview;
600 const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
601
602 using ::Tpetra::Details::Behavior;
603 const bool debug = Behavior::debug();
604 if (debug) {
605 using ::Tpetra::Details::checkGlobalDualViewValidity;
606 std::ostringstream gblErrStrm;
607 const bool verbose = Behavior::verbose();
608 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
609 const bool gblValid =
610 checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
611 comm.getRawPtr());
612 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
613 }
614
615 const size_t lclNumRows = map.is_null() ? size_t(0) : map->getLocalNumElements();
616 // Check dimensions of the input DualView. We accept that Kokkos
617 // might not allow construction of a 0 x m (Dual)View with m > 0,
618 // so we only require the number of rows to match if the
619 // (Dual)View has more than zero columns. Likewise, we only
620 // require the number of columns to match if the (Dual)View has
621 // more than zero rows.
623 view.extent(1) != 0 && static_cast<size_t>(view.extent(0)) < lclNumRows,
624 std::invalid_argument, "view.extent(0) = " << view.extent(0) << " < map->getLocalNumElements() = " << lclNumRows << ".");
625 if (whichVectors.size() != 0) {
627 view.extent(1) != 0 && view.extent(1) == 0,
628 std::invalid_argument,
629 "view.extent(1) = 0, but whichVectors.size()"
630 " = "
631 << whichVectors.size() << " > 0.");
632 size_t maxColInd = 0;
633 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
634 for (size_type k = 0; k < whichVectors.size(); ++k) {
636 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid(),
637 std::invalid_argument, "whichVectors[" << k << "] = "
638 "Teuchos::OrdinalTraits<size_t>::invalid().");
639 maxColInd = std::max(maxColInd, whichVectors[k]);
640 }
642 view.extent(1) != 0 && static_cast<size_t>(view.extent(1)) <= maxColInd,
643 std::invalid_argument, "view.extent(1) = " << view.extent(1) << " <= max(whichVectors) = " << maxColInd << ".");
644 }
645
646 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
647 // zero strides, so modify in that case.
648 const size_t LDA = getDualViewStride(view);
650 "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
651
652 if (whichVectors.size() == 1) {
653 // If whichVectors has only one entry, we don't need to bother
654 // with nonconstant stride. Just shift the view over so it
655 // points to the desired column.
656 //
657 // NOTE (mfh 10 May 2014) This is a special case where we set
658 // origView_ just to view that one column, not all of the
659 // original columns. This ensures that the use of origView_ in
660 // offsetView works correctly.
661 //
662 const std::pair<size_t, size_t> colRng(whichVectors[0],
663 whichVectors[0] + 1);
665 // whichVectors_.size() == 0 means "constant stride."
666 whichVectors_.clear();
667 }
668}
669
670template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
672 MultiVector(const Teuchos::RCP<const map_type>& map,
673 const wrapped_dual_view_type& view,
674 const Teuchos::ArrayView<const size_t>& whichVectors)
675 : base_type(map)
676 , view_(view)
677 , whichVectors_(whichVectors.begin(), whichVectors.end()) {
678 using Kokkos::ALL;
679 using Kokkos::subview;
680 const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
681
682 using ::Tpetra::Details::Behavior;
683 const bool debug = Behavior::debug();
684 if (debug) {
685 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
686 std::ostringstream gblErrStrm;
687 const bool verbose = Behavior::verbose();
688 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
689 const bool gblValid =
690 checkGlobalWrappedDualViewValidity(&gblErrStrm, view, verbose,
691 comm.getRawPtr());
692 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
693 }
694
695 const size_t lclNumRows = map.is_null() ? size_t(0) : map->getLocalNumElements();
696 // Check dimensions of the input DualView. We accept that Kokkos
697 // might not allow construction of a 0 x m (Dual)View with m > 0,
698 // so we only require the number of rows to match if the
699 // (Dual)View has more than zero columns. Likewise, we only
700 // require the number of columns to match if the (Dual)View has
701 // more than zero rows.
703 view.extent(1) != 0 && static_cast<size_t>(view.extent(0)) < lclNumRows,
704 std::invalid_argument, "view.extent(0) = " << view.extent(0) << " < map->getLocalNumElements() = " << lclNumRows << ".");
705 if (whichVectors.size() != 0) {
707 view.extent(1) != 0 && view.extent(1) == 0,
708 std::invalid_argument,
709 "view.extent(1) = 0, but whichVectors.size()"
710 " = "
711 << whichVectors.size() << " > 0.");
712 size_t maxColInd = 0;
713 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
714 for (size_type k = 0; k < whichVectors.size(); ++k) {
716 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid(),
717 std::invalid_argument, "whichVectors[" << k << "] = "
718 "Teuchos::OrdinalTraits<size_t>::invalid().");
719 maxColInd = std::max(maxColInd, whichVectors[k]);
720 }
722 view.extent(1) != 0 && static_cast<size_t>(view.extent(1)) <= maxColInd,
723 std::invalid_argument, "view.extent(1) = " << view.extent(1) << " <= max(whichVectors) = " << maxColInd << ".");
724 }
725
726 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
727 // zero strides, so modify in that case.
728 const size_t LDA = getDualViewStride(view);
730 "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
731
732 if (whichVectors.size() == 1) {
733 // If whichVectors has only one entry, we don't need to bother
734 // with nonconstant stride. Just shift the view over so it
735 // points to the desired column.
736 //
737 // NOTE (mfh 10 May 2014) This is a special case where we set
738 // origView_ just to view that one column, not all of the
739 // original columns. This ensures that the use of origView_ in
740 // offsetView works correctly.
741 //
742 const std::pair<size_t, size_t> colRng(whichVectors[0],
743 whichVectors[0] + 1);
744 view_ = takeSubview(view_, ALL(), colRng);
745 // whichVectors_.size() == 0 means "constant stride."
746 whichVectors_.clear();
747 }
748}
749
750template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
752 MultiVector(const Teuchos::RCP<const map_type>& map,
753 const dual_view_type& view,
755 const Teuchos::ArrayView<const size_t>& whichVectors)
756 : base_type(map)
757 , view_(wrapped_dual_view_type(view, origView))
758 , whichVectors_(whichVectors.begin(), whichVectors.end()) {
759 using Kokkos::ALL;
760 using Kokkos::subview;
761 const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
762
763 using ::Tpetra::Details::Behavior;
764 const bool debug = Behavior::debug();
765 if (debug) {
766 using ::Tpetra::Details::checkGlobalDualViewValidity;
767 std::ostringstream gblErrStrm;
768 const bool verbose = Behavior::verbose();
769 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
770 const bool gblValid_0 =
771 checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
772 comm.getRawPtr());
773 const bool gblValid_1 =
774 checkGlobalDualViewValidity(&gblErrStrm, origView, verbose,
775 comm.getRawPtr());
776 const bool gblValid = gblValid_0 && gblValid_1;
777 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
778 }
779
780 const size_t lclNumRows = this->getLocalLength();
781 // Check dimensions of the input DualView. We accept that Kokkos
782 // might not allow construction of a 0 x m (Dual)View with m > 0,
783 // so we only require the number of rows to match if the
784 // (Dual)View has more than zero columns. Likewise, we only
785 // require the number of columns to match if the (Dual)View has
786 // more than zero rows.
788 view.extent(1) != 0 && static_cast<size_t>(view.extent(0)) < lclNumRows,
789 std::invalid_argument, "view.extent(0) = " << view.extent(0) << " < map->getLocalNumElements() = " << lclNumRows << ".");
790 if (whichVectors.size() != 0) {
792 view.extent(1) != 0 && view.extent(1) == 0,
793 std::invalid_argument,
794 "view.extent(1) = 0, but whichVectors.size()"
795 " = "
796 << whichVectors.size() << " > 0.");
797 size_t maxColInd = 0;
798 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
799 for (size_type k = 0; k < whichVectors.size(); ++k) {
801 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid(),
802 std::invalid_argument, "whichVectors[" << k << "] = "
803 "Teuchos::OrdinalTraits<size_t>::invalid().");
804 maxColInd = std::max(maxColInd, whichVectors[k]);
805 }
807 view.extent(1) != 0 && static_cast<size_t>(view.extent(1)) <= maxColInd,
808 std::invalid_argument, "view.extent(1) = " << view.extent(1) << " <= max(whichVectors) = " << maxColInd << ".");
809 }
810
811 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
812 // zero strides, so modify in that case.
813 const size_t LDA = getDualViewStride(origView);
815 "Map and DualView origView "
816 "do not match. LDA = "
817 << LDA << " < this->getLocalLength() = " << lclNumRows << ". origView.extent(0) = " << origView.extent(0)
818 << ", origView.stride(1) = " << origView.view_device().stride(1) << ".");
819
820 if (whichVectors.size() == 1) {
821 // If whichVectors has only one entry, we don't need to bother
822 // with nonconstant stride. Just shift the view over so it
823 // points to the desired column.
824 //
825 // NOTE (mfh 10 May 2014) This is a special case where we set
826 // origView_ just to view that one column, not all of the
827 // original columns. This ensures that the use of origView_ in
828 // offsetView works correctly.
829 const std::pair<size_t, size_t> colRng(whichVectors[0],
830 whichVectors[0] + 1);
832 // origView_ = takeSubview (origView_, ALL (), colRng);
833 // whichVectors_.size() == 0 means "constant stride."
834 whichVectors_.clear();
835 }
836}
838template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
840 MultiVector(const Teuchos::RCP<const map_type>& map,
841 const Teuchos::ArrayView<const Scalar>& data,
842 const size_t LDA,
843 const size_t numVecs)
844 : base_type(map) {
845 typedef LocalOrdinal LO;
846 typedef GlobalOrdinal GO;
847 const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
848 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
849
850 // Deep copy constructor, constant stride (NO whichVectors_).
851 // There is no need for a deep copy constructor with nonconstant stride.
852
853 const size_t lclNumRows =
854 map.is_null() ? size_t(0) : map->getLocalNumElements();
855 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
856 "map->getLocalNumElements() = "
857 << lclNumRows << ".");
858 if (numVecs != 0) {
859 const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
860 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(data.size()) < minNumEntries,
861 std::invalid_argument,
862 "Input Teuchos::ArrayView does not have enough "
863 "entries, given the input Map and number of vectors in the MultiVector."
864 " data.size() = "
865 << data.size() << " < (LDA*(numVecs-1)) + "
866 "map->getLocalNumElements () = "
867 << minNumEntries << ".");
868 }
869
871 // Note: X_out will be completely overwritten
872 auto X_out = this->getLocalViewDevice(Access::OverwriteAll);
873
874 // Make an unmanaged host Kokkos::View of the input data. First
875 // create a View (X_in_orig) with the original stride. Then,
876 // create a subview (X_in) with the right number of columns.
877 const impl_scalar_type* const X_in_raw =
878 reinterpret_cast<const impl_scalar_type*>(data.getRawPtr());
879 Kokkos::View<const impl_scalar_type**,
880 Kokkos::LayoutLeft,
881 Kokkos::HostSpace,
882 Kokkos::MemoryUnmanaged>
884 const Kokkos::pair<size_t, size_t> rowRng(0, lclNumRows);
885 auto X_in = Kokkos::subview(X_in_orig, rowRng, Kokkos::ALL());
886
887 // If LDA != X_out's column stride, then we need to copy one
888 // column at a time; Kokkos::deep_copy refuses to work in that
889 // case.
890 const size_t outStride =
891 X_out.extent(1) == 0 ? size_t(1) : X_out.stride(1);
892 if (LDA == outStride) { // strides are the same; deep_copy once
893 // This only works because MultiVector uses LayoutLeft.
894 // We would need a custom copy functor otherwise.
895 // DEEP_COPY REVIEW - HOST-TO-DEVICE
896 Kokkos::deep_copy(execution_space(), X_out, X_in);
897 } else { // strides differ; copy one column at a time
898 typedef decltype(Kokkos::subview(X_out, Kokkos::ALL(), 0))
900 typedef decltype(Kokkos::subview(X_in, Kokkos::ALL(), 0))
902 for (size_t j = 0; j < numVecs; ++j) {
903 out_col_view_type X_out_j = Kokkos::subview(X_out, Kokkos::ALL(), j);
904 in_col_view_type X_in_j = Kokkos::subview(X_in, Kokkos::ALL(), j);
905 // DEEP_COPY REVIEW - HOST-TO-DEVICE
906 Kokkos::deep_copy(execution_space(), X_out_j, X_in_j);
907 }
908 }
909}
910
911template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
913 MultiVector(const Teuchos::RCP<const map_type>& map,
914 const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
915 const size_t numVecs)
916 : base_type(map) {
917 typedef impl_scalar_type IST;
918 typedef LocalOrdinal LO;
919 typedef GlobalOrdinal GO;
920 const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
921 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
922
923 const size_t lclNumRows =
924 map.is_null() ? size_t(0) : map->getLocalNumElements();
925 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numVecs < 1 || numVecs != static_cast<size_t>(ArrayOfPtrs.size()),
926 std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
927 "ArrayOfPtrs.size() (= "
928 << ArrayOfPtrs.size() << ") != numVecs.");
929 for (size_t j = 0; j < numVecs; ++j) {
930 Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
932 static_cast<size_t>(X_j_av.size()) < lclNumRows,
933 std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = " << X_j_av.size() << " < map->getLocalNumElements() = " << lclNumRows << ".");
934 }
935
937 auto X_out = this->getLocalViewDevice(Access::ReadWrite);
938
939 // Make sure that the type of a single input column has the same
940 // array layout as each output column does, so we can deep_copy.
941 using array_layout = typename decltype(X_out)::array_layout;
942 using input_col_view_type = typename Kokkos::View<const IST*,
943 array_layout,
944 Kokkos::HostSpace,
945 Kokkos::MemoryUnmanaged>;
946
947 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
948 for (size_t j = 0; j < numVecs; ++j) {
949 Teuchos::ArrayView<const IST> X_j_av =
950 Teuchos::av_reinterpret_cast<const IST>(ArrayOfPtrs[j]);
952 auto X_j_out = Kokkos::subview(X_out, rowRng, j);
953 // DEEP_COPY REVIEW - HOST-TO-DEVICE
954 Kokkos::deep_copy(execution_space(), X_j_out, X_j_in);
955 }
956}
957
958template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
960 isConstantStride() const {
961 return whichVectors_.empty();
962}
963
964template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
965size_t
967 getLocalLength() const {
968 if (this->getMap().is_null()) { // possible, due to replaceMap().
969 return static_cast<size_t>(0);
970 } else {
971 return this->getMap()->getLocalNumElements();
972 }
973}
974
975template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
979 if (this->getMap().is_null()) { // possible, due to replaceMap().
980 return static_cast<size_t>(0);
981 } else {
982 return this->getMap()->getGlobalNumElements();
983 }
984}
985
986template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
987size_t
989 getStride() const {
990 return isConstantStride() ? getDualViewStride(view_) : size_t(0);
991}
992
993template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
996 // Don't actually get a view, just get pointers.
997 auto thisData = view_.getDualView().view_host().data();
998 auto otherData = other.view_.getDualView().view_host().data();
999 size_t thisSpan = view_.getDualView().view_host().span();
1000 size_t otherSpan = other.view_.getDualView().view_host().span();
1002}
1003
1004template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1007 // Check whether the source object is a MultiVector. If not, then
1008 // we can't even compare sizes, so it's definitely not OK to
1009 // Import or Export from it.
1011 const MV* src = dynamic_cast<const MV*>(&sourceObj);
1012 if (src == nullptr) {
1013 return false;
1014 } else {
1015 // The target of the Import or Export calls checkSizes() in
1016 // DistObject::doTransfer(). By that point, we've already
1017 // constructed an Import or Export object using the two
1018 // multivectors' Maps, which means that (hopefully) we've
1019 // already checked other attributes of the multivectors. Thus,
1020 // all we need to do here is check the number of columns.
1021 return src->getNumVectors() == this->getNumVectors();
1022 }
1023}
1024
1025template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1026size_t
1029 return this->getNumVectors();
1030}
1031
1032template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1035 const size_t numSameIDs,
1036 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1037 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1038 const CombineMode CM,
1039 const execution_space& space) {
1040 using Kokkos::Compat::create_const_view;
1041 using KokkosRefactor::Details::permute_array_multi_column;
1042 using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
1043 using std::endl;
1044 using ::Tpetra::Details::Behavior;
1045 using ::Tpetra::Details::getDualViewCopyFromArrayView;
1046 using ::Tpetra::Details::ProfilingRegion;
1049 // We've already called checkSizes(), so this cast must succeed.
1050 MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&>(sourceObj));
1051
1052 bool copyOnHost;
1053 if (importsAreAliased() && (this->constantNumberOfPackets() != 0) && Behavior::enableGranularTransfers()) {
1054 // imports are aliased to the target. We already posted Irecvs
1055 // into imports using a memory space that depends on GPU
1056 // awareness. Therefore we want to modify target in the same
1057 // memory space. See comments in DistObject::beginTransfer.
1058 copyOnHost = !Behavior::assumeMpiIsGPUAware();
1059 } else {
1060 // We are free to choose the better memory space for copyAndPermute.
1062 }
1063 const char longFuncNameHost[] = "Tpetra::MultiVector::copyAndPermute[Host]";
1064 const char longFuncNameDevice[] = "Tpetra::MultiVector::copyAndPermute[Device]";
1065 const char tfecfFuncName[] = "copyAndPermute: ";
1067
1068 const bool verbose = Behavior::verbose();
1069 std::unique_ptr<std::string> prefix;
1070 if (verbose) {
1071 auto map = this->getMap();
1072 auto comm = map.is_null() ? Teuchos::null : map->getComm();
1073 const int myRank = comm.is_null() ? -1 : comm->getRank();
1074 std::ostringstream os;
1075 os << "Proc " << myRank << ": MV::copyAndPermute: ";
1076 prefix = std::unique_ptr<std::string>(new std::string(os.str()));
1077 os << "Start" << endl;
1078 std::cerr << os.str();
1079 }
1080
1082 std::logic_error, "permuteToLIDs.extent(0) = " << permuteToLIDs.extent(0) << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0) << ".");
1083 const size_t numCols = this->getNumVectors();
1084
1085 // sourceMV doesn't belong to us, so we can't sync it. Do the
1086 // copying where it's currently sync'd.
1087 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(sourceMV.need_sync_device() && sourceMV.need_sync_host(),
1088 std::logic_error,
1089 "Input MultiVector needs sync to both host "
1090 "and device.");
1091 if (verbose) {
1092 std::ostringstream os;
1093 os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1094 std::cerr << os.str();
1095 }
1096
1097 if (verbose) {
1098 std::ostringstream os;
1099 os << *prefix << "Copy first " << numSameIDs << " elements" << endl;
1100 std::cerr << os.str();
1101 }
1102
1103 // TODO (mfh 15 Sep 2013) When we replace
1104 // KokkosClassic::MultiVector with a Kokkos::View, there are two
1105 // ways to copy the data:
1106 //
1107 // 1. Get a (sub)view of each column and call deep_copy on that.
1108 // 2. Write a custom kernel to copy the data.
1109 //
1110 // The first is easier, but the second might be more performant in
1111 // case we decide to use layouts other than LayoutLeft. It might
1112 // even make sense to hide whichVectors_ in an entirely new layout
1113 // for Kokkos Views.
1114
1115 // Copy rows [0, numSameIDs-1] of the local multivectors.
1116 //
1117 // Note (ETP 2 Jul 2014) We need to always copy one column at a
1118 // time, even when both multivectors are constant-stride, since
1119 // deep_copy between strided subviews with more than one column
1120 // doesn't currently work.
1121
1122 // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1123 // both source and target are constant stride and have multiple
1124 // columns.
1125 if (numSameIDs > 0) {
1126 const std::pair<size_t, size_t> rows(0, numSameIDs);
1127 if (copyOnHost) {
1128 ProfilingRegion regionC("Tpetra::MultiVector::copy[Host]");
1129 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1130 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1131
1132 for (size_t j = 0; j < numCols; ++j) {
1133 const size_t tgtCol = isConstantStride() ? j : whichVectors_[j];
1134 const size_t srcCol =
1135 sourceMV.isConstantStride() ? j : sourceMV.whichVectors_[j];
1136
1137 auto tgt_j = Kokkos::subview(tgt_h, rows, tgtCol);
1138 auto src_j = Kokkos::subview(src_h, rows, srcCol);
1139 if (CM == ADD_ASSIGN) {
1140 // Sum src_j into tgt_j
1141 using range_t =
1142 Kokkos::RangePolicy<execution_space, size_t>;
1143 range_t rp(space, 0, numSameIDs);
1144 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1145 aaf(tgt_j, src_j);
1146 Kokkos::parallel_for(rp, aaf);
1147 } else {
1148 // Copy src_j into tgt_j
1149 // DEEP_COPY REVIEW - HOSTMIRROR-TO-HOSTMIRROR
1150 Kokkos::deep_copy(space, tgt_j, src_j);
1151 space.fence("Tpetra::MultiVector::copyAndPermute-1");
1152 }
1153 }
1154 } else { // copy on device
1155 ProfilingRegion regionC("Tpetra::MultiVector::copy[Device]");
1156 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1157 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1158
1159 for (size_t j = 0; j < numCols; ++j) {
1160 const size_t tgtCol = isConstantStride() ? j : whichVectors_[j];
1161 const size_t srcCol =
1162 sourceMV.isConstantStride() ? j : sourceMV.whichVectors_[j];
1163
1164 auto tgt_j = Kokkos::subview(tgt_d, rows, tgtCol);
1165 auto src_j = Kokkos::subview(src_d, rows, srcCol);
1166 if (CM == ADD_ASSIGN) {
1167 // Sum src_j into tgt_j
1168 using range_t =
1169 Kokkos::RangePolicy<execution_space, size_t>;
1170 range_t rp(space, 0, numSameIDs);
1171 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1172 aaf(tgt_j, src_j);
1173 Kokkos::parallel_for(rp, aaf);
1174 } else {
1175 // Copy src_j into tgt_j
1176 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
1177 Kokkos::deep_copy(space, tgt_j, src_j);
1178 space.fence("Tpetra::MultiVector::copyAndPermute-2");
1179 }
1180 }
1181 }
1182 }
1183
1184 // For the remaining GIDs, execute the permutations. This may
1185 // involve noncontiguous access of both source and destination
1186 // vectors, depending on the LID lists.
1187 //
1188 // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1189 // the same process, this merges their values by replacement of
1190 // the last encountered GID, not by the specified merge rule
1191 // (such as ADD).
1192
1193 // If there are no permutations, we are done
1194 if (permuteFromLIDs.extent(0) == 0 ||
1195 permuteToLIDs.extent(0) == 0) {
1196 if (verbose) {
1197 std::ostringstream os;
1198 os << *prefix << "No permutations. Done!" << endl;
1199 std::cerr << os.str();
1200 }
1201 return;
1202 }
1203 ProfilingRegion regionP("Tpetra::MultiVector::permute");
1204
1205 if (verbose) {
1206 std::ostringstream os;
1207 os << *prefix << "Permute" << endl;
1208 std::cerr << os.str();
1209 }
1210
1211 // We could in theory optimize for the case where exactly one of
1212 // them is constant stride, but we don't currently do that.
1213 const bool nonConstStride =
1214 !this->isConstantStride() || !sourceMV.isConstantStride();
1215
1216 if (verbose) {
1217 std::ostringstream os;
1218 os << *prefix << "nonConstStride="
1219 << (nonConstStride ? "true" : "false") << endl;
1220 std::cerr << os.str();
1221 }
1222
1223 // We only need the "which vectors" arrays if either the source or
1224 // target MV is not constant stride. Since we only have one
1225 // kernel that must do double-duty, we have to create a "which
1226 // vectors" array for the MV that _is_ constant stride.
1227 Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1228 Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1229 if (nonConstStride) {
1230 if (this->whichVectors_.size() == 0) {
1231 Kokkos::DualView<size_t*, device_type> tmpTgt("tgtWhichVecs", numCols);
1232 tmpTgt.modify_host();
1233 for (size_t j = 0; j < numCols; ++j) {
1234 tmpTgt.view_host()(j) = j;
1235 }
1236 if (!copyOnHost) {
1237 tmpTgt.sync_device();
1238 }
1240 } else {
1241 Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_();
1242 tgtWhichVecs =
1244 "tgtWhichVecs",
1245 copyOnHost);
1246 }
1247 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(tgtWhichVecs.extent(0)) !=
1248 this->getNumVectors(),
1249 std::logic_error, "tgtWhichVecs.extent(0) = " << tgtWhichVecs.extent(0) << " != this->getNumVectors() = " << this->getNumVectors() << ".");
1250
1251 if (sourceMV.whichVectors_.size() == 0) {
1252 Kokkos::DualView<size_t*, device_type> tmpSrc("srcWhichVecs", numCols);
1253 tmpSrc.modify_host();
1254 for (size_t j = 0; j < numCols; ++j) {
1255 tmpSrc.view_host()(j) = j;
1257 if (!copyOnHost) {
1258 tmpSrc.sync_device();
1259 }
1261 } else {
1262 Teuchos::ArrayView<const size_t> srcWhichVecsT =
1263 sourceMV.whichVectors_();
1264 srcWhichVecs =
1266 "srcWhichVecs",
1267 copyOnHost);
1268 }
1269 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(srcWhichVecs.extent(0)) !=
1270 sourceMV.getNumVectors(),
1271 std::logic_error,
1272 "srcWhichVecs.extent(0) = " << srcWhichVecs.extent(0)
1273 << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors()
1274 << ".");
1275 }
1276
1277 if (copyOnHost) { // permute on host too
1278 if (verbose) {
1279 std::ostringstream os;
1280 os << *prefix << "Get permute LIDs on host" << std::endl;
1281 std::cerr << os.str();
1282 }
1283 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1284 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1285
1286 TEUCHOS_ASSERT(!permuteToLIDs.need_sync_host());
1288 TEUCHOS_ASSERT(!permuteFromLIDs.need_sync_host());
1289 auto permuteFromLIDs_h =
1291
1292 if (verbose) {
1293 std::ostringstream os;
1294 os << *prefix << "Permute on host" << endl;
1295 std::cerr << os.str();
1296 }
1297 if (nonConstStride) {
1298 // No need to sync first, because copyOnHost argument to
1299 // getDualViewCopyFromArrayView puts them in the right place.
1300 auto tgtWhichVecs_h =
1301 create_const_view(tgtWhichVecs.view_host());
1302 auto srcWhichVecs_h =
1303 create_const_view(srcWhichVecs.view_host());
1304 if (CM == ADD_ASSIGN) {
1305 using op_type = KokkosRefactor::Details::AddOp;
1306 permute_array_multi_column_variable_stride(tgt_h, src_h,
1310 srcWhichVecs_h, numCols,
1311 op_type());
1312 } else {
1313 using op_type = KokkosRefactor::Details::InsertOp;
1314 permute_array_multi_column_variable_stride(tgt_h, src_h,
1318 srcWhichVecs_h, numCols,
1319 op_type());
1320 }
1321 } else {
1322 if (CM == ADD_ASSIGN) {
1323 using op_type = KokkosRefactor::Details::AddOp;
1324 permute_array_multi_column(tgt_h, src_h, permuteToLIDs_h,
1325 permuteFromLIDs_h, numCols, op_type());
1326 } else {
1327 using op_type = KokkosRefactor::Details::InsertOp;
1328 permute_array_multi_column(tgt_h, src_h, permuteToLIDs_h,
1329 permuteFromLIDs_h, numCols, op_type());
1330 }
1331 }
1332 } else { // permute on device
1333 if (verbose) {
1334 std::ostringstream os;
1335 os << *prefix << "Get permute LIDs on device" << endl;
1336 std::cerr << os.str();
1337 }
1338 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1339 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1340
1341 TEUCHOS_ASSERT(!permuteToLIDs.need_sync_device());
1342 auto permuteToLIDs_d = create_const_view(permuteToLIDs.view_device());
1343 TEUCHOS_ASSERT(!permuteFromLIDs.need_sync_device());
1344 auto permuteFromLIDs_d =
1345 create_const_view(permuteFromLIDs.view_device());
1347 if (verbose) {
1348 std::ostringstream os;
1349 os << *prefix << "Permute on device" << endl;
1350 std::cerr << os.str();
1352 if (nonConstStride) {
1353 // No need to sync first, because copyOnHost argument to
1354 // getDualViewCopyFromArrayView puts them in the right place.
1356 auto srcWhichVecs_d = create_const_view(srcWhichVecs.view_device());
1357 if (CM == ADD_ASSIGN) {
1358 using op_type = KokkosRefactor::Details::AddOp;
1359 permute_array_multi_column_variable_stride(space, tgt_d, src_d,
1363 srcWhichVecs_d, numCols,
1364 op_type());
1365 } else {
1366 using op_type = KokkosRefactor::Details::InsertOp;
1367 permute_array_multi_column_variable_stride(space, tgt_d, src_d,
1372 op_type());
1373 }
1374 } else {
1375 if (CM == ADD_ASSIGN) {
1376 using op_type = KokkosRefactor::Details::AddOp;
1377 permute_array_multi_column(space, tgt_d, src_d, permuteToLIDs_d,
1378 permuteFromLIDs_d, numCols, op_type());
1379 } else {
1380 using op_type = KokkosRefactor::Details::InsertOp;
1381 permute_array_multi_column(space, tgt_d, src_d, permuteToLIDs_d,
1382 permuteFromLIDs_d, numCols, op_type());
1383 }
1384 }
1385 }
1386
1387 if (verbose) {
1388 std::ostringstream os;
1389 os << *prefix << "Done!" << endl;
1390 std::cerr << os.str();
1392}
1393
1394template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1396 const SrcDistObject& sourceObj, const size_t numSameIDs,
1397 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1398 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1399 const CombineMode CM) {
1400 copyAndPermute(sourceObj, numSameIDs, permuteToLIDs, permuteFromLIDs, CM,
1402}
1403
1404template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1407 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1408 Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1409 Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1410 size_t& constantNumPackets,
1411 const execution_space& space) {
1412 using Kokkos::Compat::create_const_view;
1413 using Kokkos::Compat::getKokkosViewDeepCopy;
1414 using std::endl;
1415 using ::Tpetra::Details::Behavior;
1416 using ::Tpetra::Details::ProfilingRegion;
1417 using ::Tpetra::Details::reallocDualViewIfNeeded;
1419
1420 // We've already called checkSizes(), so this cast must succeed.
1421 MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&>(sourceObj));
1422
1423 const bool packOnHost = runKernelOnHost(sourceMV);
1424 const char longFuncNameHost[] = "Tpetra::MultiVector::packAndPrepare[Host]";
1425 const char longFuncNameDevice[] = "Tpetra::MultiVector::packAndPrepare[Device]";
1426 const char tfecfFuncName[] = "packAndPrepare: ";
1428
1429 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1430 // have the option to check indices. We do so when Tpetra is in
1431 // debug mode. It is in debug mode by default in a debug build,
1432 // but you may control this at run time, before launching the
1433 // executable, by setting the TPETRA_DEBUG environment variable to
1434 // "1" (or "TRUE").
1435 const bool debugCheckIndices = Behavior::debug();
1436 // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1437 // environment variable to "1" (or "TRUE") for copious debug
1438 // output to std::cerr on every MPI process. This is unwise for
1439 // runs with large numbers of MPI processes.
1440 const bool printDebugOutput = Behavior::verbose();
1441
1442 std::unique_ptr<std::string> prefix;
1444 auto map = this->getMap();
1445 auto comm = map.is_null() ? Teuchos::null : map->getComm();
1446 const int myRank = comm.is_null() ? -1 : comm->getRank();
1447 std::ostringstream os;
1448 os << "Proc " << myRank << ": MV::packAndPrepare: ";
1449 prefix = std::unique_ptr<std::string>(new std::string(os.str()));
1450 os << "Start" << endl;
1451 std::cerr << os.str();
1452 }
1453
1454 const size_t numCols = sourceMV.getNumVectors();
1455
1456 // This spares us from needing to fill numExportPacketsPerLID.
1457 // Setting constantNumPackets to a nonzero value signals that
1458 // all packets have the same number of entries.
1459 constantNumPackets = numCols;
1460
1461 // If we have no exports, there is nothing to do. Make sure this
1462 // goes _after_ setting constantNumPackets correctly.
1463 if (exportLIDs.extent(0) == 0) {
1464 if (printDebugOutput) {
1465 std::ostringstream os;
1466 os << *prefix << "No exports on this proc, DONE" << std::endl;
1467 std::cerr << os.str();
1468 }
1469 return;
1470 }
1471
1472 /* The layout in the export for MultiVectors is as follows:
1473 exports = { all of the data from row exportLIDs.front() ;
1474 ....
1475 all of the data from row exportLIDs.back() }
1476 This doesn't have the best locality, but is necessary because
1477 the data for a Packet (all data associated with an LID) is
1478 required to be contiguous. */
1479
1480 // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1481 // packing scheme in the above comment? The data going to a
1482 // particular process must be contiguous, of course, but those
1483 // data could include entries from multiple LIDs. DistObject just
1484 // needs to know how to index into that data. Kokkos is good at
1485 // decoupling storage intent from data layout choice.
1486
1487 const size_t numExportLIDs = exportLIDs.extent(0);
1488 const size_t newExportsSize = numCols * numExportLIDs;
1489 if (printDebugOutput) {
1490 std::ostringstream os;
1491 os << *prefix << "realloc: "
1492 << "numExportLIDs: " << numExportLIDs
1493 << ", exports.extent(0): " << exports.extent(0)
1494 << ", newExportsSize: " << newExportsSize << std::endl;
1495 std::cerr << os.str();
1496 }
1497 reallocDualViewIfNeeded(exports, newExportsSize, "exports");
1498
1499 // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1500 // sync it. Pack it where it's currently sync'd.
1501 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(sourceMV.need_sync_device() && sourceMV.need_sync_host(),
1502 std::logic_error,
1503 "Input MultiVector needs sync to both host "
1504 "and device.");
1505 if (printDebugOutput) {
1506 std::ostringstream os;
1507 os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1508 std::cerr << os.str();
1510
1511 // Mark 'exports' here, since we might have resized it above.
1512 // Resizing currently requires calling the constructor, which
1513 // clears out the 'modified' flags.
1514 if (packOnHost) {
1515 // nde 06 Feb 2020: If 'exports' does not require resize
1516 // when reallocDualViewIfNeeded is called, the modified flags
1517 // are not cleared out. This can result in host and device views
1518 // being out-of-sync, resuling in an error in exports.modify_* calls.
1519 // Clearing the sync flags prevents this possible case.
1520 exports.clear_sync_state();
1521 exports.modify_host();
1522 } else {
1523 // nde 06 Feb 2020: If 'exports' does not require resize
1524 // when reallocDualViewIfNeeded is called, the modified flags
1525 // are not cleared out. This can result in host and device views
1526 // being out-of-sync, resuling in an error in exports.modify_* calls.
1527 // Clearing the sync flags prevents this possible case.
1528 exports.clear_sync_state();
1529 exports.modify_device();
1530 }
1531
1532 if (numCols == 1) { // special case for one column only
1533 // MultiVector always represents a single column with constant
1534 // stride, but it doesn't hurt to implement both cases anyway.
1535 //
1536 // ETP: I'm not sure I agree with the above statement. Can't a single-
1537 // column multivector be a subview of another multi-vector, in which case
1538 // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1539 // separately.
1540 //
1541 // mfh 18 Jan 2016: In answer to ETP's comment above:
1542 // MultiVector treats single-column MultiVectors created using a
1543 // "nonconstant stride constructor" as a special case, and makes
1544 // them constant stride (by making whichVectors_ have length 0).
1545 if (sourceMV.isConstantStride()) {
1546 using KokkosRefactor::Details::pack_array_single_column;
1547 if (printDebugOutput) {
1548 std::ostringstream os;
1549 os << *prefix << "Pack numCols=1 const stride" << endl;
1550 std::cerr << os.str();
1551 }
1552 if (packOnHost) {
1553 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1554 pack_array_single_column(exports.view_host(),
1555 src_host,
1556 exportLIDs.view_host(),
1557 0,
1559 } else { // pack on device
1560 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1561 pack_array_single_column(exports.view_device(),
1562 src_dev,
1563 exportLIDs.view_device(),
1564 0,
1566 space);
1567 }
1568 } else {
1569 using KokkosRefactor::Details::pack_array_single_column;
1570 if (printDebugOutput) {
1571 std::ostringstream os;
1572 os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1573 std::cerr << os.str();
1574 }
1575 if (packOnHost) {
1576 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1577 pack_array_single_column(exports.view_host(),
1578 src_host,
1579 exportLIDs.view_host(),
1580 sourceMV.whichVectors_[0],
1582 } else { // pack on device
1583 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1584 pack_array_single_column(exports.view_device(),
1585 src_dev,
1586 exportLIDs.view_device(),
1587 sourceMV.whichVectors_[0],
1589 space);
1590 }
1591 }
1592 } else { // the source MultiVector has multiple columns
1593 if (sourceMV.isConstantStride()) {
1594 using KokkosRefactor::Details::pack_array_multi_column;
1595 if (printDebugOutput) {
1596 std::ostringstream os;
1597 os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1598 std::cerr << os.str();
1599 }
1600 if (packOnHost) {
1601 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1602 pack_array_multi_column(exports.view_host(),
1603 src_host,
1604 exportLIDs.view_host(),
1605 numCols,
1607 } else { // pack on device
1608 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1609 pack_array_multi_column(exports.view_device(),
1610 src_dev,
1611 exportLIDs.view_device(),
1612 numCols,
1614 space);
1616 } else {
1617 using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1619 std::ostringstream os;
1620 os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1621 << endl;
1622 std::cerr << os.str();
1623 }
1624 // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1625 // whichVectors_ can be expensive, but pack and unpack for
1626 // nonconstant-stride MultiVectors is slower anyway.
1627 using IST = impl_scalar_type;
1628 using DV = Kokkos::DualView<IST*, device_type>;
1629 using HES = typename DV::t_host::execution_space;
1630 using DES = typename DV::t_dev::execution_space;
1631 Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_();
1632 if (packOnHost) {
1633 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1634 pack_array_multi_column_variable_stride(exports.view_host(),
1635 src_host,
1636 exportLIDs.view_host(),
1638 numCols,
1640 } else { // pack on device
1641 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1642 pack_array_multi_column_variable_stride(exports.view_device(),
1643 src_dev,
1644 exportLIDs.view_device(),
1646 numCols,
1648 }
1649 }
1650 }
1651
1652 if (printDebugOutput) {
1653 std::ostringstream os;
1654 os << *prefix << "Done!" << endl;
1655 std::cerr << os.str();
1656 }
1657}
1659template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1662 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1663 Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1664 Kokkos::DualView<size_t*, buffer_device_type> numExportPacketsPerLID,
1665 size_t& constantNumPackets) {
1667}
1669template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1670template <class NO>
1671typename std::enable_if<std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1672 typename NO::device_type::memory_space>::value,
1673 bool>::type
1676 const bool verbose,
1677 const std::string* prefix,
1678 const bool areRemoteLIDsContiguous,
1680 // This implementation of reallocImportsIfNeeded is an
1681 // optimization that is specific to MultiVector. We check if the
1682 // imports_ view can be aliased to the end of the data view_. If
1683 // that is the case, we can skip the unpackAndCombine call.
1684
1685 if (verbose) {
1686 std::ostringstream os;
1687 os << *prefix << "Realloc (if needed) imports_ from "
1688 << this->imports_.extent(0) << " to " << newSize << std::endl;
1689 std::cerr << os.str();
1690 }
1691
1692 bool reallocated = false;
1693
1694 using IST = impl_scalar_type;
1695 using DV = Kokkos::DualView<IST*, Kokkos::LayoutLeft, buffer_device_type>;
1696
1697 // Conditions for aliasing memory:
1698 // - When both sides of the dual view are in the same memory
1699 // space, we do not need to worry about syncing things.
1700 // - When both memory spaces are different, we only alias if this
1701 // does not incur additional sync'ing.
1702 // - The remote LIDs need to be contiguous, so that we do not need
1703 // to reorder received information.
1704 // - CombineMode needs to be INSERT.
1705 // - The number of vectors needs to be 1, otherwise we need to
1706 // reorder the received data.
1707 if ((std::is_same_v<typename dual_view_type::t_dev::device_type, typename dual_view_type::t_host::device_type> ||
1708 (Details::Behavior::assumeMpiIsGPUAware() && !this->need_sync_device()) ||
1709 (!Details::Behavior::assumeMpiIsGPUAware() && !this->need_sync_host())) &&
1710 areRemoteLIDsContiguous &&
1711 (CM == INSERT || CM == REPLACE) &&
1712 (getNumVectors() == 1) &&
1713 (newSize > 0)) {
1714 size_t offset = getLocalLength() - newSize;
1715 reallocated = this->imports_.view_device().data() != view_.getDualView().view_device().data() + offset;
1716 if (reallocated) {
1717 typedef std::pair<size_t, size_t> range_type;
1718 this->imports_ = DV(view_.getDualView(),
1719 range_type(offset, getLocalLength()),
1720 0);
1721
1722 if (verbose) {
1723 std::ostringstream os;
1724 os << *prefix << "Aliased imports_ to MV.view_" << std::endl;
1725 std::cerr << os.str();
1726 }
1727 }
1728 return reallocated;
1729 }
1730 {
1731 using ::Tpetra::Details::reallocDualViewIfNeeded;
1732 reallocated =
1733 reallocDualViewIfNeeded(this->unaliased_imports_, newSize, "imports");
1734 if (verbose) {
1735 std::ostringstream os;
1736 os << *prefix << "Finished realloc'ing unaliased_imports_" << std::endl;
1737 std::cerr << os.str();
1738 }
1739 this->imports_ = this->unaliased_imports_;
1740 }
1741 return reallocated;
1742}
1743
1744template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1745template <class NO>
1746typename std::enable_if<!std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1747 typename NO::device_type::memory_space>::value,
1748 bool>::type
1751 const bool verbose,
1752 const std::string* prefix,
1753 const bool areRemoteLIDsContiguous,
1754 const CombineMode CM) {
1756}
1757
1758template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1760 reallocImportsIfNeeded(const size_t newSize,
1761 const bool verbose,
1762 const std::string* prefix,
1763 const bool areRemoteLIDsContiguous,
1764 const CombineMode CM) {
1766 return reallocImportsIfNeededImpl<Node>(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1767}
1768
1769template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1772 return (this->imports_.view_device().data() + this->imports_.view_device().extent(0) ==
1773 view_.getDualView().view_device().data() + view_.getDualView().view_device().extent(0));
1774}
1775
1776template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1778 unpackAndCombine(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1779 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1780 Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1781 const size_t constantNumPackets,
1782 const CombineMode CM,
1783 const execution_space& space) {
1784 using Kokkos::Compat::getKokkosViewDeepCopy;
1785 using KokkosRefactor::Details::unpack_array_multi_column;
1786 using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1787 using std::endl;
1788 using ::Tpetra::Details::Behavior;
1789 using ::Tpetra::Details::ProfilingRegion;
1790
1791 const bool unpackOnHost = runKernelOnHost(imports);
1792
1793 const char longFuncNameHost[] = "Tpetra::MultiVector::unpackAndCombine[Host]";
1794 const char longFuncNameDevice[] = "Tpetra::MultiVector::unpackAndCombine[Device]";
1796 const char tfecfFuncName[] = "unpackAndCombine: ";
1797
1798 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1799 // have the option to check indices. We do so when Tpetra is in
1800 // debug mode. It is in debug mode by default in a debug build,
1801 // but you may control this at run time, before launching the
1802 // executable, by setting the TPETRA_DEBUG environment variable to
1803 // "1" (or "TRUE").
1804 const bool debugCheckIndices = Behavior::debug();
1805
1806 const bool printDebugOutput = Behavior::verbose();
1807 std::unique_ptr<std::string> prefix;
1808 if (printDebugOutput) {
1809 auto map = this->getMap();
1810 auto comm = map.is_null() ? Teuchos::null : map->getComm();
1811 const int myRank = comm.is_null() ? -1 : comm->getRank();
1812 std::ostringstream os;
1813 os << "Proc " << myRank << ": " << longFuncName << ": ";
1814 prefix = std::unique_ptr<std::string>(new std::string(os.str()));
1815 os << "Start" << endl;
1816 std::cerr << os.str();
1817 }
1818
1819 // If we have no imports, there is nothing to do
1820 if (importLIDs.extent(0) == 0) {
1821 if (printDebugOutput) {
1822 std::ostringstream os;
1823 os << *prefix << "No imports. Done!" << endl;
1824 std::cerr << os.str();
1825 }
1826 return;
1827 }
1828
1829 // Check, whether imports_ is a subview of the MV view.
1830 if (importsAreAliased()) {
1831 if (printDebugOutput) {
1832 std::ostringstream os;
1833 os << *prefix << "Skipping unpack, since imports_ is aliased to MV.view_. Done!" << endl;
1834 std::cerr << os.str();
1835 }
1836 return;
1838
1839 ProfilingRegion regionUAC(longFuncName);
1840
1841 const size_t numVecs = getNumVectors();
1842 if (debugCheckIndices) {
1843 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(imports.extent(0)) !=
1844 numVecs * importLIDs.extent(0),
1845 std::runtime_error,
1846 "imports.extent(0) = " << imports.extent(0)
1847 << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1848 << " * " << importLIDs.extent(0) << " = "
1849 << numVecs * importLIDs.extent(0) << ".");
1850
1851 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(constantNumPackets == static_cast<size_t>(0), std::runtime_error,
1852 "constantNumPackets input argument must be nonzero.");
1853
1854 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(numVecs) !=
1855 static_cast<size_t>(constantNumPackets),
1856 std::runtime_error, "constantNumPackets must equal numVecs.");
1857 }
1858
1859 // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1860 // the memory space in which the imports buffer was last modified and
1861 // the size of the imports buffer.
1862 // DistObject::doTransferNew decides where it was last modified (based on
1863 // whether communication buffers used were on host or device).
1864 if (unpackOnHost) {
1865 if (this->imports_.need_sync_host()) this->imports_.sync_host();
1866 } else {
1867 if (this->imports_.need_sync_device()) this->imports_.sync_device();
1868 }
1869
1870 if (printDebugOutput) {
1871 std::ostringstream os;
1872 os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1873 << endl;
1874 std::cerr << os.str();
1875 }
1876
1877 // We have to sync before modifying, because this method may read
1878 // as well as write (depending on the CombineMode).
1879 auto imports_d = imports.view_device();
1880 auto imports_h = imports.view_host();
1881 auto importLIDs_d = importLIDs.view_device();
1882 auto importLIDs_h = importLIDs.view_host();
1883
1884 Kokkos::DualView<size_t*, device_type> whichVecs;
1885 if (!isConstantStride()) {
1886 Kokkos::View<const size_t*, Kokkos::HostSpace,
1887 Kokkos::MemoryUnmanaged>
1888 whichVecsIn(whichVectors_.getRawPtr(),
1889 numVecs);
1890 whichVecs = Kokkos::DualView<size_t*, device_type>("whichVecs", numVecs);
1891 if (unpackOnHost) {
1892 whichVecs.modify_host();
1893 // DEEP_COPY REVIEW - NOT TESTED FOR CUDA BUILD
1894 Kokkos::deep_copy(whichVecs.view_host(), whichVecsIn);
1895 } else {
1896 whichVecs.modify_device();
1897 // DEEP_COPY REVIEW - HOST-TO-DEVICE
1898 Kokkos::deep_copy(whichVecs.view_device(), whichVecsIn);
1899 }
1900 }
1901 auto whichVecs_d = whichVecs.view_device();
1902 auto whichVecs_h = whichVecs.view_host();
1903
1904 /* The layout in the export for MultiVectors is as follows:
1905 imports = { all of the data from row exportLIDs.front() ;
1906 ....
1907 all of the data from row exportLIDs.back() }
1908 This doesn't have the best locality, but is necessary because
1909 the data for a Packet (all data associated with an LID) is
1910 required to be contiguous. */
1911
1912 if (numVecs > 0 && importLIDs.extent(0) > 0) {
1913 using dev_exec_space = typename dual_view_type::t_dev::execution_space;
1914 using host_exec_space = typename dual_view_type::t_host::execution_space;
1915
1916 // This fixes GitHub Issue #4418.
1917 const bool use_atomic_updates = unpackOnHost ? host_exec_space().concurrency() != 1 : dev_exec_space().concurrency() != 1;
1918
1919 if (printDebugOutput) {
1920 std::ostringstream os;
1921 os << *prefix << "Unpack: " << combineModeToString(CM) << endl;
1922 std::cerr << os.str();
1923 }
1924
1925 // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1926 // custom combine modes, start editing here.
1927
1928 if (CM == INSERT || CM == REPLACE) {
1929 using op_type = KokkosRefactor::Details::InsertOp;
1930 if (isConstantStride()) {
1931 if (unpackOnHost) {
1932 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1933 unpack_array_multi_column(host_exec_space(),
1935 op_type(), numVecs,
1938
1939 } else { // unpack on device
1940 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1941 unpack_array_multi_column(space,
1943 op_type(), numVecs,
1946 }
1947 } else { // not constant stride
1948 if (unpackOnHost) {
1949 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1950 unpack_array_multi_column_variable_stride(host_exec_space(),
1951 X_h, imports_h,
1954 op_type(),
1958 } else { // unpack on device
1959 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1960 unpack_array_multi_column_variable_stride(space,
1961 X_d, imports_d,
1964 op_type(),
1965 numVecs,
1968 }
1969 }
1970 } else if (CM == ADD || CM == ADD_ASSIGN) {
1971 using op_type = KokkosRefactor::Details::AddOp;
1972 if (isConstantStride()) {
1973 if (unpackOnHost) {
1974 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1975 unpack_array_multi_column(host_exec_space(),
1977 op_type(), numVecs,
1980 } else { // unpack on device
1981 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1982 unpack_array_multi_column(space,
1984 op_type(), numVecs,
1987 }
1988 } else { // not constant stride
1989 if (unpackOnHost) {
1990 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1991 unpack_array_multi_column_variable_stride(host_exec_space(),
1992 X_h, imports_h,
1995 op_type(),
1996 numVecs,
1999 } else { // unpack on device
2000 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2001 unpack_array_multi_column_variable_stride(space,
2002 X_d, imports_d,
2005 op_type(),
2006 numVecs,
2009 }
2010 }
2011 } else if (CM == ABSMAX) {
2012 using op_type = KokkosRefactor::Details::AbsMaxOp;
2013 if (isConstantStride()) {
2014 if (unpackOnHost) {
2015 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2016 unpack_array_multi_column(host_exec_space(),
2018 op_type(), numVecs,
2021 } else { // unpack on device
2022 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2023 unpack_array_multi_column(space,
2025 op_type(), numVecs,
2028 }
2029 } else {
2030 if (unpackOnHost) {
2031 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2032 unpack_array_multi_column_variable_stride(host_exec_space(),
2033 X_h, imports_h,
2036 op_type(),
2037 numVecs,
2040 } else { // unpack on device
2041 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2042 unpack_array_multi_column_variable_stride(space,
2043 X_d, imports_d,
2046 op_type(),
2047 numVecs,
2050 }
2051 }
2052 } else {
2053 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, "Invalid CombineMode");
2054 }
2055 } else {
2056 if (printDebugOutput) {
2057 std::ostringstream os;
2058 os << *prefix << "Nothing to unpack" << endl;
2059 std::cerr << os.str();
2060 }
2061 }
2062
2063 if (printDebugOutput) {
2064 std::ostringstream os;
2065 os << *prefix << "Done!" << endl;
2066 std::cerr << os.str();
2067 }
2068}
2069
2070template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2072 unpackAndCombine(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
2073 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
2074 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
2076 const CombineMode CM) {
2077 unpackAndCombine(importLIDs, imports, numPacketsPerLID, constantNumPackets, CM, execution_space());
2078}
2079
2080template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2081size_t
2083 getNumVectors() const {
2084 if (isConstantStride()) {
2085 return static_cast<size_t>(view_.extent(1));
2086 } else {
2087 return static_cast<size_t>(whichVectors_.size());
2088 }
2089}
2090
2091namespace { // (anonymous)
2092
2093template <class RV>
2095 const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2096 const bool distributed) {
2097 using Teuchos::REDUCE_MAX;
2098 using Teuchos::REDUCE_SUM;
2099 using Teuchos::reduceAll;
2100 typedef typename RV::non_const_value_type dot_type;
2101
2102 const size_t numVecs = dotsOut.extent(0);
2103
2104 // If the MultiVector is distributed over multiple processes, do
2105 // the distributed (interprocess) part of the dot product. We
2106 // assume that the MPI implementation can read from and write to
2107 // device memory.
2108 //
2109 // replaceMap() may have removed some processes. Those
2110 // processes have a null Map. They must not participate in any
2111 // collective operations. We ask first whether the Map is null,
2112 // because isDistributed() defers that question to the Map. We
2113 // still compute and return local dots for processes not
2114 // participating in collective operations; those probably don't
2115 // make any sense, but it doesn't hurt to do them, since it's
2116 // illegal to call dot() on those processes anyway.
2117 if (distributed && !comm.is_null()) {
2118 // The calling process only participates in the collective if
2119 // both the Map and its Comm on that process are nonnull.
2120 const int nv = static_cast<int>(numVecs);
2122
2123 if (commIsInterComm) {
2124 // If comm is an intercomm, then we may not alias input and
2125 // output buffers, so we have to make a copy of the local
2126 // sum.
2127 typename RV::non_const_type lclDots(Kokkos::ViewAllocateWithoutInitializing("tmp"), numVecs);
2128 // DEEP_COPY REVIEW - NOT TESTED
2129 Kokkos::deep_copy(lclDots, dotsOut);
2130 const dot_type* const lclSum = lclDots.data();
2131 dot_type* const gblSum = dotsOut.data();
2133 } else {
2134 dot_type* const inout = dotsOut.data();
2136 }
2137 }
2138}
2139} // namespace
2140
2141template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2144 const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const {
2145 using Kokkos::subview;
2146 using Teuchos::Comm;
2147 using Teuchos::null;
2148 using Teuchos::RCP;
2149 using ::Tpetra::Details::Behavior;
2150 // View of all the dot product results.
2151 typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
2152 typedef typename dual_view_type::t_dev_const XMV;
2153 const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
2154
2155 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::dot (Kokkos::View)");
2156
2157 const size_t numVecs = this->getNumVectors();
2158 if (numVecs == 0) {
2159 return; // nothing to do
2160 }
2161 const size_t lclNumRows = this->getLocalLength();
2162 const size_t numDots = static_cast<size_t>(dots.extent(0));
2163 const bool debug = Behavior::debug();
2164
2165 if (debug) {
2166 const bool compat = this->getMap()->isCompatible(*(A.getMap()));
2167 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!compat, std::invalid_argument,
2168 "'*this' MultiVector is not "
2169 "compatible with the input MultiVector A. We only test for this "
2170 "in debug mode.");
2171 }
2172
2173 // FIXME (mfh 11 Jul 2014) These exception tests may not
2174 // necessarily be thrown on all processes consistently. We should
2175 // instead pass along error state with the inner product. We
2176 // could do this by setting an extra slot to
2177 // Kokkos::ArithTraits<dot_type>::one() on error. The
2178 // final sum should be
2179 // Kokkos::ArithTraits<dot_type>::zero() if not error.
2181 lclNumRows != A.getLocalLength(), std::runtime_error,
2182 "MultiVectors do not have the same local length. "
2183 "this->getLocalLength() = "
2184 << lclNumRows << " != "
2185 "A.getLocalLength() = "
2186 << A.getLocalLength() << ".");
2188 numVecs != A.getNumVectors(), std::runtime_error,
2189 "MultiVectors must have the same number of columns (vectors). "
2190 "this->getNumVectors() = "
2191 << numVecs << " != "
2192 "A.getNumVectors() = "
2193 << A.getNumVectors() << ".");
2195 numDots != numVecs, std::runtime_error,
2196 "The output array 'dots' must have the same number of entries as the "
2197 "number of columns (vectors) in *this and A. dots.extent(0) = "
2198 << numDots << " != this->getNumVectors() = " << numVecs << ".");
2199
2200 const std::pair<size_t, size_t> colRng(0, numVecs);
2202 RCP<const Comm<int> > comm = this->getMap().is_null() ? null : this->getMap()->getComm();
2203
2204 auto thisView = this->getLocalViewDevice(Access::ReadOnly);
2205 auto A_view = A.getLocalViewDevice(Access::ReadOnly);
2206
2207 ::Tpetra::Details::lclDot<RV, XMV>(dotsOut, thisView, A_view, lclNumRows, numVecs,
2208 this->whichVectors_.getRawPtr(),
2209 A.whichVectors_.getRawPtr(),
2210 this->isConstantStride(), A.isConstantStride());
2211
2212 // lbv 15 mar 2023: Kokkos Kernels provides non-blocking BLAS
2213 // functions unless they explicitely return a value to Host.
2214 // Here while the lclDot are on host, they are not a return
2215 // value, therefore they might be avaible to us immediately.
2216 // Adding a frnce here guarantees that we will have the lclDot
2217 // ahead of the MPI reduction.
2219 exec_space_instance.fence("Tpetra::MultiVector::dot");
2220
2221 gblDotImpl(dotsOut, comm, this->isDistributed());
2222}
2223
2224namespace { // (anonymous)
2225template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2229 using ::Tpetra::Details::ProfilingRegion;
2231 using dot_type = typename MV::dot_type;
2232 ProfilingRegion region("Tpetra::multiVectorSingleColumnDot");
2233
2234 auto map = x.getMap();
2235 Teuchos::RCP<const Teuchos::Comm<int> > comm =
2236 map.is_null() ? Teuchos::null : map->getComm();
2237 if (comm.is_null()) {
2238 return KokkosKernels::ArithTraits<dot_type>::zero();
2239 } else {
2240 using LO = LocalOrdinal;
2241 // The min just ensures that we don't overwrite memory that
2242 // doesn't belong to us, in the erroneous input case where x
2243 // and y have different numbers of rows.
2244 const LO lclNumRows = static_cast<LO>(std::min(x.getLocalLength(),
2245 y.getLocalLength()));
2246 const Kokkos::pair<LO, LO> rowRng(0, lclNumRows);
2247 dot_type lclDot = KokkosKernels::ArithTraits<dot_type>::zero();
2248 dot_type gblDot = KokkosKernels::ArithTraits<dot_type>::zero();
2250 // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2251 // const_cast<MV&>(x).sync_device ();
2252 // const_cast<MV&>(y).sync_device ();
2253
2254 auto x_2d = x.getLocalViewDevice(Access::ReadOnly);
2255 auto x_1d = Kokkos::subview(x_2d, rowRng, 0);
2256 auto y_2d = y.getLocalViewDevice(Access::ReadOnly);
2257 auto y_1d = Kokkos::subview(y_2d, rowRng, 0);
2258 lclDot = KokkosBlas::dot(x_1d, y_1d);
2259
2260 if (x.isDistributed()) {
2261 using Teuchos::outArg;
2262 using Teuchos::REDUCE_SUM;
2263 using Teuchos::reduceAll;
2265 } else {
2266 gblDot = lclDot;
2267 }
2268 return gblDot;
2269 }
2270}
2271} // namespace
2272
2273template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2276 const Teuchos::ArrayView<dot_type>& dots) const {
2277 const char tfecfFuncName[] = "dot: ";
2278 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::dot (Teuchos::ArrayView)");
2279
2280 const size_t numVecs = this->getNumVectors();
2281 const size_t lclNumRows = this->getLocalLength();
2282 const size_t numDots = static_cast<size_t>(dots.size());
2283
2284 // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2285 // not necessarily be thrown on all processes consistently. We
2286 // keep them for now, because MultiVector's unit tests insist on
2287 // them. In the future, we should instead pass along error state
2288 // with the inner product. We could do this by setting an extra
2289 // slot to Kokkos::ArithTraits<dot_type>::one() on error.
2290 // The final sum should be
2291 // Kokkos::ArithTraits<dot_type>::zero() if not error.
2292 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclNumRows != A.getLocalLength(), std::runtime_error,
2293 "MultiVectors do not have the same local length. "
2294 "this->getLocalLength() = "
2295 << lclNumRows << " != "
2296 "A.getLocalLength() = "
2297 << A.getLocalLength() << ".");
2298 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numVecs != A.getNumVectors(), std::runtime_error,
2299 "MultiVectors must have the same number of columns (vectors). "
2300 "this->getNumVectors() = "
2301 << numVecs << " != "
2302 "A.getNumVectors() = "
2303 << A.getNumVectors() << ".");
2305 "The output array 'dots' must have the same number of entries as the "
2306 "number of columns (vectors) in *this and A. dots.extent(0) = "
2307 << numDots << " != this->getNumVectors() = " << numVecs << ".");
2308
2309 if (numVecs == 1 && this->isConstantStride() && A.isConstantStride()) {
2311 dots[0] = gblDot;
2312 } else {
2313 this->dot(A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr(), numDots));
2314 }
2315}
2316
2317template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2319 norm2(const Teuchos::ArrayView<mag_type>& norms) const {
2321 using ::Tpetra::Details::NORM_TWO;
2322 using ::Tpetra::Details::ProfilingRegion;
2323 ProfilingRegion region("Tpetra::MV::norm2 (host output)");
2324
2325 // The function needs to be able to sync X.
2326 MV& X = const_cast<MV&>(*this);
2327 multiVectorNormImpl(norms.getRawPtr(), X, NORM_TWO);
2328}
2329
2330template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2332 norm2(const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const {
2333 Teuchos::ArrayView<mag_type> norms_av(norms.data(), norms.extent(0));
2334 this->norm2(norms_av);
2335}
2336
2337template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2339 norm1(const Teuchos::ArrayView<mag_type>& norms) const {
2341 using ::Tpetra::Details::NORM_ONE;
2342 using ::Tpetra::Details::ProfilingRegion;
2343 ProfilingRegion region("Tpetra::MV::norm1 (host output)");
2344
2345 // The function needs to be able to sync X.
2346 MV& X = const_cast<MV&>(*this);
2347 multiVectorNormImpl(norms.data(), X, NORM_ONE);
2348}
2349
2350template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2352 norm1(const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const {
2353 Teuchos::ArrayView<mag_type> norms_av(norms.data(), norms.extent(0));
2354 this->norm1(norms_av);
2355}
2356
2357template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2359 normInf(const Teuchos::ArrayView<mag_type>& norms) const {
2361 using ::Tpetra::Details::NORM_INF;
2362 using ::Tpetra::Details::ProfilingRegion;
2363 ProfilingRegion region("Tpetra::MV::normInf (host output)");
2364
2365 // The function needs to be able to sync X.
2366 MV& X = const_cast<MV&>(*this);
2367 multiVectorNormImpl(norms.getRawPtr(), X, NORM_INF);
2368}
2369
2370template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2372 normInf(const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const {
2373 Teuchos::ArrayView<mag_type> norms_av(norms.data(), norms.extent(0));
2374 this->normInf(norms_av);
2375}
2376
2377template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2379 meanValue(const Teuchos::ArrayView<impl_scalar_type>& means) const {
2380 // KR FIXME Overload this method to take a View.
2381 using Kokkos::ALL;
2382 using Kokkos::subview;
2383 using Teuchos::Comm;
2384 using Teuchos::RCP;
2385 using Teuchos::REDUCE_SUM;
2386 using Teuchos::reduceAll;
2387 typedef KokkosKernels::ArithTraits<impl_scalar_type> ATS;
2388
2389 const size_t lclNumRows = this->getLocalLength();
2390 const size_t numVecs = this->getNumVectors();
2391 const size_t numMeans = static_cast<size_t>(means.size());
2392
2394 numMeans != numVecs, std::runtime_error,
2395 "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2396 << " != this->getNumVectors() = " << numVecs << ".");
2397
2398 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2399 const std::pair<size_t, size_t> colRng(0, numVecs);
2400
2401 // Make sure that the final output view has the same layout as the
2402 // temporary view's host_mirror_type. Left or Right doesn't matter for
2403 // a 1-D array anyway; this is just to placate the compiler.
2404 typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2405 typedef Kokkos::View<impl_scalar_type*,
2406 typename local_view_type::host_mirror_type::array_layout,
2407 Kokkos::HostSpace,
2408 Kokkos::MemoryTraits<Kokkos::Unmanaged> >
2411
2412 RCP<const Comm<int> > comm = this->getMap().is_null() ? Teuchos::null : this->getMap()->getComm();
2413
2414 // If we need sync to device, then host has the most recent version.
2415 const bool useHostVersion = this->need_sync_device();
2416 if (useHostVersion) {
2417 // DualView was last modified on host, so run the local kernel there.
2418 auto X_lcl = subview(getLocalViewHost(Access::ReadOnly),
2419 rowRng, Kokkos::ALL());
2420 // Compute the local sum of each column.
2421 Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums("MV::meanValue tmp", numVecs);
2422 if (isConstantStride()) {
2423 KokkosBlas::sum(lclSums, X_lcl);
2424 } else {
2425 for (size_t j = 0; j < numVecs; ++j) {
2426 const size_t col = whichVectors_[j];
2427 KokkosBlas::sum(subview(lclSums, j), subview(X_lcl, ALL(), col));
2428 }
2429 }
2430
2431 // If there are multiple MPI processes, the all-reduce reads
2432 // from lclSums, and writes to meansOut. Otherwise, we just
2433 // copy lclSums into meansOut.
2434 if (!comm.is_null() && this->isDistributed()) {
2435 reduceAll(*comm, REDUCE_SUM, static_cast<int>(numVecs),
2436 lclSums.data(), meansOut.data());
2437 } else {
2438 // DEEP_COPY REVIEW - NOT TESTED
2439 Kokkos::deep_copy(meansOut, lclSums);
2440 }
2441 } else {
2442 // DualView was last modified on device, so run the local kernel there.
2443 auto X_lcl = subview(this->getLocalViewDevice(Access::ReadOnly),
2444 rowRng, Kokkos::ALL());
2445
2446 // Compute the local sum of each column.
2447 Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums("MV::meanValue tmp", numVecs);
2448 if (isConstantStride()) {
2449 KokkosBlas::sum(lclSums, X_lcl);
2450 } else {
2451 for (size_t j = 0; j < numVecs; ++j) {
2452 const size_t col = whichVectors_[j];
2453 KokkosBlas::sum(subview(lclSums, j), subview(X_lcl, ALL(), col));
2454 }
2455 }
2456 // lbv 10 mar 2023: Kokkos Kernels provides non-blocking BLAS
2457 // functions unless they explicitly return a value to Host.
2458 // Here while the lclSums are on the host, they are not a return
2459 // value, therefore they might be available to us immediately.
2460 // Adding a fence here guarantees that we will have the lclSums
2461 // ahead of the MPI reduction.
2463 exec_space_instance.fence("Tpetra::MultiVector::mean");
2464
2465 // If there are multiple MPI processes, the all-reduce reads
2466 // from lclSums, and writes to meansOut. (We assume that MPI
2467 // can read device memory.) Otherwise, we just copy lclSums
2468 // into meansOut.
2469 if (!comm.is_null() && this->isDistributed()) {
2470 reduceAll(*comm, REDUCE_SUM, static_cast<int>(numVecs),
2471 lclSums.data(), meansOut.data());
2472 } else {
2473 // DEEP_COPY REVIEW - HOST-TO-HOST - NOT TESTED FOR MPI BUILD
2474 Kokkos::deep_copy(meansOut, lclSums);
2475 }
2476 }
2477
2478 // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2479 // to the magnitude type, since operator/ (std::complex<T>, int)
2480 // isn't necessarily defined.
2482 ATS::one() / static_cast<mag_type>(this->getGlobalLength());
2483 for (size_t k = 0; k < numMeans; ++k) {
2485 }
2486}
2487
2488template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2490 randomize() {
2491 typedef impl_scalar_type IST;
2492 typedef KokkosKernels::ArithTraits<IST> ATS;
2493 typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2494 typedef typename pool_type::generator_type generator_type;
2495
2496 const IST max = Kokkos::rand<generator_type, IST>::max();
2497 const IST min = ATS::is_signed ? IST(-max) : ATS::zero();
2498
2499 this->randomize(static_cast<Scalar>(min), static_cast<Scalar>(max));
2500}
2501
2502template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2504 randomize(const Scalar& minVal, const Scalar& maxVal) {
2505 typedef impl_scalar_type IST;
2506 typedef Tpetra::Details::Static_Random_XorShift64_Pool<typename device_type::execution_space> tpetra_pool_type;
2507 typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2508
2509 // Seed the pool based on the system RNG and the MPI rank, if needed
2510 if (!tpetra_pool_type::isSet())
2511 tpetra_pool_type::resetPool(this->getMap()->getComm()->getRank());
2512
2513 pool_type& rand_pool = tpetra_pool_type::getPool();
2514 const IST max = static_cast<IST>(maxVal);
2515 const IST min = static_cast<IST>(minVal);
2516
2517 auto thisView = this->getLocalViewDevice(Access::OverwriteAll);
2518
2519 if (isConstantStride()) {
2520 Kokkos::fill_random(thisView, rand_pool, min, max);
2521 } else {
2522 const size_t numVecs = getNumVectors();
2523 for (size_t k = 0; k < numVecs; ++k) {
2524 const size_t col = whichVectors_[k];
2525 auto X_k = Kokkos::subview(thisView, Kokkos::ALL(), col);
2526 Kokkos::fill_random(X_k, rand_pool, min, max);
2527 }
2528 }
2529}
2530
2531template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2533 putScalar(const Scalar& alpha) {
2534 using ::Tpetra::Details::ProfilingRegion;
2535 using ::Tpetra::Details::Blas::fill;
2536 using DES = typename dual_view_type::t_dev::execution_space;
2537 using HES = typename dual_view_type::t_host::execution_space;
2538 using LO = LocalOrdinal;
2539 ProfilingRegion region("Tpetra::MultiVector::putScalar");
2540
2541 // We need this cast for cases like Scalar = std::complex<T> but
2542 // impl_scalar_type = Kokkos::complex<T>.
2543 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
2544 const LO lclNumRows = static_cast<LO>(this->getLocalLength());
2545 const LO numVecs = static_cast<LO>(this->getNumVectors());
2546
2547 // Modify the most recently updated version of the data. This
2548 // avoids sync'ing, which could violate users' expectations.
2549 //
2550 // If we need sync to device, then host has the most recent version.
2551 const bool runOnHost = runKernelOnHost(*this);
2552 if (!runOnHost) {
2553 auto X = this->getLocalViewDevice(Access::OverwriteAll);
2554 if (this->isConstantStride()) {
2555 fill(DES(), X, theAlpha, lclNumRows, numVecs);
2556 } else {
2557 fill(DES(), X, theAlpha, lclNumRows, numVecs,
2558 this->whichVectors_.getRawPtr());
2559 }
2560 } else { // last modified in host memory, so modify data there.
2561 auto X = this->getLocalViewHost(Access::OverwriteAll);
2562 if (this->isConstantStride()) {
2563 fill(HES(), X, theAlpha, lclNumRows, numVecs);
2564 } else {
2565 fill(HES(), X, theAlpha, lclNumRows, numVecs,
2566 this->whichVectors_.getRawPtr());
2567 }
2568 }
2569}
2570
2571template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2573 replaceMap(const Teuchos::RCP<const map_type>& newMap) {
2574 using Teuchos::ArrayRCP;
2575 using Teuchos::Comm;
2576 using Teuchos::RCP;
2577 using ST = Scalar;
2578 using LO = LocalOrdinal;
2579 using GO = GlobalOrdinal;
2580
2581 // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2582 // it might work if the MV is a column view of another MV.
2583 // However, things might go wrong when restoring the original
2584 // Map, so we don't allow this case for now.
2586 !this->isConstantStride(), std::logic_error,
2587 "Tpetra::MultiVector::replaceMap: This method does not currently work "
2588 "if the MultiVector is a column view of another MultiVector (that is, if "
2589 "isConstantStride() == false).");
2590
2591 // Case 1: current Map and new Map are both nonnull on this process.
2592 // Case 2: current Map is nonnull, new Map is null.
2593 // Case 3: current Map is null, new Map is nonnull.
2594 // Case 4: both Maps are null: forbidden.
2595 //
2596 // Case 1 means that we don't have to do anything on this process,
2597 // other than assign the new Map. (We always have to do that.)
2598 // It's an error for the user to supply a Map that requires
2599 // resizing in this case.
2600 //
2601 // Case 2 means that the calling process is in the current Map's
2602 // communicator, but will be excluded from the new Map's
2603 // communicator. We don't have to do anything on the calling
2604 // process; just leave whatever data it may have alone.
2605 //
2606 // Case 3 means that the calling process is excluded from the
2607 // current Map's communicator, but will be included in the new
2608 // Map's communicator. This means we need to (re)allocate the
2609 // local DualView if it does not have the right number of rows.
2610 // If the new number of rows is nonzero, we'll fill the newly
2611 // allocated local data with zeros, as befits a projection
2612 // operation.
2613 //
2614 // The typical use case for Case 3 is that the MultiVector was
2615 // first created with the Map with more processes, then that Map
2616 // was replaced with a Map with fewer processes, and finally the
2617 // original Map was restored on this call to replaceMap.
2618
2619 if (this->getMap().is_null()) { // current Map is null
2620 // If this->getMap() is null, that means that this MultiVector
2621 // has already had replaceMap happen to it. In that case, just
2622 // reallocate the DualView with the right size.
2623
2625 newMap.is_null(), std::invalid_argument,
2626 "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2627 "This probably means that the input Map is incorrect.");
2628
2629 // Case 3: current Map is null, new Map is nonnull.
2630 // Reallocate the DualView with the right dimensions.
2631 const size_t newNumRows = newMap->getLocalNumElements();
2632 const size_t origNumRows = view_.extent(0);
2633 const size_t numCols = this->getNumVectors();
2634
2635 if (origNumRows != newNumRows || view_.extent(1) != numCols) {
2637 }
2638 } else if (newMap.is_null()) { // Case 2: current Map is nonnull, new Map is null
2639 // I am an excluded process. Reinitialize my data so that I
2640 // have 0 rows. Keep the number of columns as before.
2641 const size_t newNumRows = static_cast<size_t>(0);
2642 const size_t numCols = this->getNumVectors();
2644 }
2645
2646 this->map_ = newMap;
2647}
2648
2649template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2651 scale(const Scalar& alpha) {
2652 using Kokkos::ALL;
2653 using IST = impl_scalar_type;
2654
2655 const IST theAlpha = static_cast<IST>(alpha);
2656 if (theAlpha == KokkosKernels::ArithTraits<IST>::one()) {
2657 return; // do nothing
2658 }
2659 const size_t lclNumRows = getLocalLength();
2660 const size_t numVecs = getNumVectors();
2661 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2662 const std::pair<size_t, size_t> colRng(0, numVecs);
2663
2664 // We can't substitute putScalar(0.0) for scale(0.0), because the
2665 // former will overwrite NaNs present in the MultiVector. The
2666 // semantics of this call require multiplying them by 0, which
2667 // IEEE 754 requires to be NaN.
2668
2669 // If we need sync to device, then host has the most recent version.
2670 const bool useHostVersion = need_sync_device();
2671 if (useHostVersion) {
2672 auto Y_lcl = Kokkos::subview(getLocalViewHost(Access::ReadWrite), rowRng, ALL());
2673 if (isConstantStride()) {
2674 KokkosBlas::scal(Y_lcl, theAlpha, Y_lcl);
2675 } else {
2676 for (size_t k = 0; k < numVecs; ++k) {
2677 const size_t Y_col = whichVectors_[k];
2678 auto Y_k = Kokkos::subview(Y_lcl, ALL(), Y_col);
2679 KokkosBlas::scal(Y_k, theAlpha, Y_k);
2680 }
2681 }
2682 } else { // work on device
2683 auto Y_lcl = Kokkos::subview(getLocalViewDevice(Access::ReadWrite), rowRng, ALL());
2684 if (isConstantStride()) {
2685 KokkosBlas::scal(Y_lcl, theAlpha, Y_lcl);
2686 } else {
2687 for (size_t k = 0; k < numVecs; ++k) {
2688 const size_t Y_col = isConstantStride() ? k : whichVectors_[k];
2689 auto Y_k = Kokkos::subview(Y_lcl, ALL(), Y_col);
2690 KokkosBlas::scal(Y_k, theAlpha, Y_k);
2691 }
2692 }
2693 }
2694}
2695
2696template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2698 scale(const Teuchos::ArrayView<const Scalar>& alphas) {
2699 const size_t numVecs = this->getNumVectors();
2700 const size_t numAlphas = static_cast<size_t>(alphas.size());
2702 numAlphas != numVecs, std::invalid_argument,
2703 "Tpetra::MultiVector::"
2704 "scale: alphas.size() = "
2705 << numAlphas << " != this->getNumVectors() = "
2706 << numVecs << ".");
2707
2708 // Use a DualView to copy the scaling constants onto the device.
2709 using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2710 k_alphas_type k_alphas("alphas::tmp", numAlphas);
2711 k_alphas.modify_host();
2712 for (size_t i = 0; i < numAlphas; ++i) {
2713 k_alphas.view_host()(i) = static_cast<impl_scalar_type>(alphas[i]);
2714 }
2715 k_alphas.sync_device();
2716 // Invoke the scale() overload that takes a device View of coefficients.
2717 this->scale(k_alphas.view_device());
2718}
2719
2720template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2722 scale(const Kokkos::View<const impl_scalar_type*, device_type>& alphas) {
2723 using Kokkos::ALL;
2724 using Kokkos::subview;
2725
2726 const size_t lclNumRows = this->getLocalLength();
2727 const size_t numVecs = this->getNumVectors();
2729 static_cast<size_t>(alphas.extent(0)) != numVecs,
2730 std::invalid_argument,
2731 "Tpetra::MultiVector::scale(alphas): "
2732 "alphas.extent(0) = "
2733 << alphas.extent(0)
2734 << " != this->getNumVectors () = " << numVecs << ".");
2735 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2736 const std::pair<size_t, size_t> colRng(0, numVecs);
2737
2738 // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2739 // type of the return value of subview. This is because if we
2740 // switch the array layout from LayoutLeft to LayoutRight
2741 // (preferred for performance of block operations), the types
2742 // below won't be valid. (A view of a column of a LayoutRight
2743 // multivector has LayoutStride, not LayoutLeft.)
2744
2745 // If we need sync to device, then host has the most recent version.
2746 const bool useHostVersion = this->need_sync_device();
2747 if (useHostVersion) {
2748 // Work in host memory. This means we need to create a host
2749 // mirror of the input View of coefficients.
2750 auto alphas_h = Kokkos::create_mirror_view(alphas);
2751 // DEEP_COPY REVIEW - NOT TESTED
2752 Kokkos::deep_copy(alphas_h, alphas);
2753
2754 auto Y_lcl = subview(this->getLocalViewHost(Access::ReadWrite), rowRng, ALL());
2755 if (isConstantStride()) {
2756 KokkosBlas::scal(Y_lcl, alphas_h, Y_lcl);
2757 } else {
2758 for (size_t k = 0; k < numVecs; ++k) {
2759 const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2760 auto Y_k = subview(Y_lcl, ALL(), Y_col);
2761 // We don't have to use the entire 1-D View here; we can use
2762 // the version that takes a scalar coefficient.
2763 KokkosBlas::scal(Y_k, alphas_h(k), Y_k);
2764 }
2765 }
2766 } else { // Work in device memory, using the input View 'alphas' directly.
2767 auto Y_lcl = subview(this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL());
2768 if (isConstantStride()) {
2769 KokkosBlas::scal(Y_lcl, alphas, Y_lcl);
2770 } else {
2771 // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2772 // as values on host, so copy them to host. Another approach
2773 // would be to fix scal() so that it takes a 0-D View as the
2774 // second argument.
2775 auto alphas_h = Kokkos::create_mirror_view(alphas);
2776 // DEEP_COPY REVIEW - NOT TESTED
2777 Kokkos::deep_copy(alphas_h, alphas);
2778
2779 for (size_t k = 0; k < numVecs; ++k) {
2780 const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2781 auto Y_k = subview(Y_lcl, ALL(), Y_col);
2782 KokkosBlas::scal(Y_k, alphas_h(k), Y_k);
2783 }
2784 }
2785 }
2786}
2787
2788template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2790 scale(const Scalar& alpha,
2792 using Kokkos::ALL;
2793 using Kokkos::subview;
2794 const char tfecfFuncName[] = "scale: ";
2795
2796 const size_t lclNumRows = getLocalLength();
2797 const size_t numVecs = getNumVectors();
2798
2800 lclNumRows != A.getLocalLength(), std::invalid_argument,
2801 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2802 << A.getLocalLength() << ".");
2804 numVecs != A.getNumVectors(), std::invalid_argument,
2805 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2806 << A.getNumVectors() << ".");
2807
2808 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
2809 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2810 const std::pair<size_t, size_t> colRng(0, numVecs);
2811
2812 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2813 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2814 auto Y_lcl = subview(Y_lcl_orig, rowRng, ALL());
2815 auto X_lcl = subview(X_lcl_orig, rowRng, ALL());
2816
2817 if (isConstantStride() && A.isConstantStride()) {
2818 KokkosBlas::scal(Y_lcl, theAlpha, X_lcl);
2819 } else {
2820 // Make sure that Kokkos only uses the local length for add.
2821 for (size_t k = 0; k < numVecs; ++k) {
2822 const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2823 const size_t X_col = A.isConstantStride() ? k : A.whichVectors_[k];
2824 auto Y_k = subview(Y_lcl, ALL(), Y_col);
2825 auto X_k = subview(X_lcl, ALL(), X_col);
2826
2827 KokkosBlas::scal(Y_k, theAlpha, X_k);
2828 }
2829 }
2830}
2831
2832template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2835 const char tfecfFuncName[] = "reciprocal: ";
2836
2838 getLocalLength() != A.getLocalLength(), std::runtime_error,
2839 "MultiVectors do not have the same local length. "
2840 "this->getLocalLength() = "
2841 << getLocalLength()
2842 << " != A.getLocalLength() = " << A.getLocalLength() << ".");
2844 A.getNumVectors() != this->getNumVectors(), std::runtime_error,
2845 ": MultiVectors do not have the same number of columns (vectors). "
2846 "this->getNumVectors() = "
2847 << getNumVectors()
2848 << " != A.getNumVectors() = " << A.getNumVectors() << ".");
2849
2850 const size_t numVecs = getNumVectors();
2851
2852 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2853 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2854
2855 if (isConstantStride() && A.isConstantStride()) {
2856 KokkosBlas::reciprocal(this_view_dev, A_view_dev);
2857 } else {
2858 using Kokkos::ALL;
2859 using Kokkos::subview;
2860 for (size_t k = 0; k < numVecs; ++k) {
2861 const size_t this_col = isConstantStride() ? k : whichVectors_[k];
2863 const size_t A_col = isConstantStride() ? k : A.whichVectors_[k];
2864 auto vector_Ak = subview(A_view_dev, ALL(), A_col);
2865 KokkosBlas::reciprocal(vector_k, vector_Ak);
2866 }
2867 }
2868}
2869
2870template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2873 const char tfecfFuncName[] = "abs";
2874
2876 getLocalLength() != A.getLocalLength(), std::runtime_error,
2877 ": MultiVectors do not have the same local length. "
2878 "this->getLocalLength() = "
2879 << getLocalLength()
2880 << " != A.getLocalLength() = " << A.getLocalLength() << ".");
2882 A.getNumVectors() != this->getNumVectors(), std::runtime_error,
2883 ": MultiVectors do not have the same number of columns (vectors). "
2884 "this->getNumVectors() = "
2885 << getNumVectors()
2886 << " != A.getNumVectors() = " << A.getNumVectors() << ".");
2887 const size_t numVecs = getNumVectors();
2888
2889 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2890 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2891
2892 if (isConstantStride() && A.isConstantStride()) {
2893 KokkosBlas::abs(this_view_dev, A_view_dev);
2894 } else {
2895 using Kokkos::ALL;
2896 using Kokkos::subview;
2897
2898 for (size_t k = 0; k < numVecs; ++k) {
2899 const size_t this_col = isConstantStride() ? k : whichVectors_[k];
2901 const size_t A_col = isConstantStride() ? k : A.whichVectors_[k];
2902 auto vector_Ak = subview(A_view_dev, ALL(), A_col);
2903 KokkosBlas::abs(vector_k, vector_Ak);
2904 }
2905 }
2906}
2907
2908template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2910 update(const Scalar& alpha,
2912 const Scalar& beta) {
2913 const char tfecfFuncName[] = "update: ";
2914 using Kokkos::ALL;
2915 using Kokkos::subview;
2916
2917 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::update(alpha,A,beta)");
2918
2919 const size_t lclNumRows = getLocalLength();
2920 const size_t numVecs = getNumVectors();
2921
2924 lclNumRows != A.getLocalLength(), std::invalid_argument,
2925 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2926 << A.getLocalLength() << ".");
2928 numVecs != A.getNumVectors(), std::invalid_argument,
2929 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2930 << A.getNumVectors() << ".");
2931 }
2932
2933 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
2934 const impl_scalar_type theBeta = static_cast<impl_scalar_type>(beta);
2935 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2936 const std::pair<size_t, size_t> colRng(0, numVecs);
2937
2938 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2939 auto Y_lcl = subview(Y_lcl_orig, rowRng, Kokkos::ALL());
2940 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2941 auto X_lcl = subview(X_lcl_orig, rowRng, Kokkos::ALL());
2942
2943 // The device memory of *this is about to be modified
2944 if (isConstantStride() && A.isConstantStride()) {
2945 KokkosBlas::axpby(theAlpha, X_lcl, theBeta, Y_lcl);
2946 } else {
2947 // Make sure that Kokkos only uses the local length for add.
2948 for (size_t k = 0; k < numVecs; ++k) {
2949 const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2950 const size_t X_col = A.isConstantStride() ? k : A.whichVectors_[k];
2951 auto Y_k = subview(Y_lcl, ALL(), Y_col);
2952 auto X_k = subview(X_lcl, ALL(), X_col);
2953
2954 KokkosBlas::axpby(theAlpha, X_k, theBeta, Y_k);
2955 }
2956 }
2957}
2958
2959template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2961 update(const Scalar& alpha,
2963 const Scalar& beta,
2965 const Scalar& gamma) {
2966 using Kokkos::ALL;
2967 using Kokkos::subview;
2968
2969 const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
2970
2971 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::update(alpha,A,beta,B,gamma)");
2972
2973 const size_t lclNumRows = this->getLocalLength();
2974 const size_t numVecs = getNumVectors();
2975
2978 lclNumRows != A.getLocalLength(), std::invalid_argument,
2979 "The input MultiVector A has " << A.getLocalLength() << " local "
2980 "row(s), but this MultiVector has "
2981 << lclNumRows << " local "
2982 "row(s).");
2984 lclNumRows != B.getLocalLength(), std::invalid_argument,
2985 "The input MultiVector B has " << B.getLocalLength() << " local "
2986 "row(s), but this MultiVector has "
2987 << lclNumRows << " local "
2988 "row(s).");
2990 A.getNumVectors() != numVecs, std::invalid_argument,
2991 "The input MultiVector A has " << A.getNumVectors() << " column(s), "
2992 "but this MultiVector has "
2993 << numVecs << " column(s).");
2995 B.getNumVectors() != numVecs, std::invalid_argument,
2996 "The input MultiVector B has " << B.getNumVectors() << " column(s), "
2997 "but this MultiVector has "
2998 << numVecs << " column(s).");
2999 }
3000
3001 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
3002 const impl_scalar_type theBeta = static_cast<impl_scalar_type>(beta);
3003 const impl_scalar_type theGamma = static_cast<impl_scalar_type>(gamma);
3004
3005 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
3006 const std::pair<size_t, size_t> colRng(0, numVecs);
3007
3008 // Prefer 'auto' over specifying the type explicitly. This avoids
3009 // issues with a subview possibly having a different type than the
3010 // original view.
3011 auto C_lcl = subview(this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL());
3012 auto A_lcl = subview(A.getLocalViewDevice(Access::ReadOnly), rowRng, ALL());
3013 auto B_lcl = subview(B.getLocalViewDevice(Access::ReadOnly), rowRng, ALL());
3014
3015 if (isConstantStride() && A.isConstantStride() && B.isConstantStride()) {
3016 KokkosBlas::update(theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3017 } else {
3018 // Some input (or *this) is not constant stride,
3019 // so perform the update one column at a time.
3020 for (size_t k = 0; k < numVecs; ++k) {
3021 const size_t this_col = isConstantStride() ? k : whichVectors_[k];
3022 const size_t A_col = A.isConstantStride() ? k : A.whichVectors_[k];
3023 const size_t B_col = B.isConstantStride() ? k : B.whichVectors_[k];
3024 KokkosBlas::update(theAlpha, subview(A_lcl, rowRng, A_col),
3025 theBeta, subview(B_lcl, rowRng, B_col),
3027 }
3028 }
3029}
3030
3031template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3032Teuchos::ArrayRCP<const Scalar>
3034 getData(size_t j) const {
3035 using Kokkos::ALL;
3036 using IST = impl_scalar_type;
3037 const char tfecfFuncName[] = "getData: ";
3038
3039 // Any MultiVector method that called the (classic) Kokkos Node's
3040 // viewBuffer or viewBufferNonConst methods always implied a
3041 // device->host synchronization. Thus, we synchronize here as
3042 // well.
3043
3044 auto hostView = getLocalViewHost(Access::ReadOnly);
3045 const size_t col = isConstantStride() ? j : whichVectors_[j];
3046 auto hostView_j = Kokkos::subview(hostView, ALL(), col);
3047 Teuchos::ArrayRCP<const IST> dataAsArcp =
3048 Kokkos::Compat::persistingView(hostView_j, 0, getLocalLength());
3049
3050 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(hostView_j.extent(0)) <
3051 static_cast<size_t>(dataAsArcp.size()),
3052 std::logic_error,
3053 "hostView_j.extent(0) = " << hostView_j.extent(0)
3054 << " < dataAsArcp.size() = " << dataAsArcp.size() << ". "
3055 "Please report this bug to the Tpetra developers.");
3056
3057 return Teuchos::arcp_reinterpret_cast<const Scalar>(dataAsArcp);
3058}
3059
3060template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3061Teuchos::ArrayRCP<Scalar>
3063 getDataNonConst(size_t j) {
3064 using Kokkos::ALL;
3065 using Kokkos::subview;
3066 using IST = impl_scalar_type;
3067 const char tfecfFuncName[] = "getDataNonConst: ";
3068
3069 auto hostView = getLocalViewHost(Access::ReadWrite);
3070 const size_t col = isConstantStride() ? j : whichVectors_[j];
3071 auto hostView_j = subview(hostView, ALL(), col);
3072 Teuchos::ArrayRCP<IST> dataAsArcp =
3073 Kokkos::Compat::persistingView(hostView_j, 0, getLocalLength());
3074
3075 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(hostView_j.extent(0)) <
3076 static_cast<size_t>(dataAsArcp.size()),
3077 std::logic_error,
3078 "hostView_j.extent(0) = " << hostView_j.extent(0)
3079 << " < dataAsArcp.size() = " << dataAsArcp.size() << ". "
3080 "Please report this bug to the Tpetra developers.");
3081
3082 return Teuchos::arcp_reinterpret_cast<Scalar>(dataAsArcp);
3083}
3084
3085template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3086Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3088 subCopy(const Teuchos::ArrayView<const size_t>& cols) const {
3089 using Teuchos::RCP;
3091
3092 // Check whether the index set in cols is contiguous. If it is,
3093 // use the more efficient Range1D version of subCopy.
3094 bool contiguous = true;
3095 const size_t numCopyVecs = static_cast<size_t>(cols.size());
3096 for (size_t j = 1; j < numCopyVecs; ++j) {
3097 if (cols[j] != cols[j - 1] + static_cast<size_t>(1)) {
3098 contiguous = false;
3099 break;
3100 }
3101 }
3102 if (contiguous && numCopyVecs > 0) {
3103 return this->subCopy(Teuchos::Range1D(cols[0], cols[numCopyVecs - 1]));
3104 } else {
3105 RCP<const MV> X_sub = this->subView(cols);
3106 RCP<MV> Y(new MV(this->getMap(), numCopyVecs, false));
3107 Y->assign(*X_sub);
3108 return Y;
3109 }
3110}
3111
3112template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3113Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3115 subCopy(const Teuchos::Range1D& colRng) const {
3116 using Teuchos::RCP;
3118
3119 RCP<const MV> X_sub = this->subView(colRng);
3120 RCP<MV> Y(new MV(this->getMap(), static_cast<size_t>(colRng.size()), false));
3121 Y->assign(*X_sub);
3122 return Y;
3123}
3124
3125template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3126size_t
3128 getOrigNumLocalRows() const {
3129 return view_.origExtent(0);
3130}
3131
3132template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3133size_t
3135 getOrigNumLocalCols() const {
3136 return view_.origExtent(1);
3137}
3138
3139template <class Scalar, class LO, class GO, class Node>
3142 const Teuchos::RCP<const map_type>& subMap,
3143 const local_ordinal_type rowOffset)
3144 : base_type(subMap) {
3145 using Kokkos::ALL;
3146 using Kokkos::subview;
3147 using std::endl;
3148 using Teuchos::outArg;
3149 using Teuchos::RCP;
3150 using Teuchos::rcp;
3151 using Teuchos::REDUCE_MIN;
3152 using Teuchos::reduceAll;
3154 const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3155 const char suffix[] = "Please report this bug to the Tpetra developers.";
3156 int lclGood = 1;
3157 int gblGood = 1;
3158 std::unique_ptr<std::ostringstream> errStrm;
3159 const bool debug = ::Tpetra::Details::Behavior::debug();
3160 const bool verbose = ::Tpetra::Details::Behavior::verbose();
3161
3162 // Be careful to use the input Map's communicator, not X's. The
3163 // idea is that, on return, *this is a subview of X, using the
3164 // input Map.
3165 const auto comm = subMap->getComm();
3166 TEUCHOS_ASSERT(!comm.is_null());
3167 const int myRank = comm->getRank();
3168
3169 const LO lclNumRowsBefore = static_cast<LO>(X.getLocalLength());
3170 const LO numCols = static_cast<LO>(X.getNumVectors());
3171 const LO newNumRows = static_cast<LO>(subMap->getLocalNumElements());
3172 if (verbose) {
3173 std::ostringstream os;
3174 os << "Proc " << myRank << ": " << prefix
3175 << "X: {lclNumRows: " << lclNumRowsBefore
3176 << ", origLclNumRows: " << X.getOrigNumLocalRows()
3177 << ", numCols: " << numCols << "}, "
3178 << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3179 std::cerr << os.str();
3180 }
3181 // We ask for the _original_ number of rows in X, because X could
3182 // be a shorter (fewer rows) view of a longer MV. (For example, X
3183 // could be a domain Map view of a column Map MV.)
3184 const bool tooManyElts =
3185 newNumRows + rowOffset > static_cast<LO>(X.getOrigNumLocalRows());
3186 if (tooManyElts) {
3187 errStrm = std::unique_ptr<std::ostringstream>(new std::ostringstream);
3188 *errStrm << " Proc " << myRank << ": subMap->getLocalNumElements() (="
3189 << newNumRows << ") + rowOffset (=" << rowOffset
3190 << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows()
3191 << ")." << endl;
3192 lclGood = 0;
3193 TEUCHOS_TEST_FOR_EXCEPTION(!debug && tooManyElts, std::invalid_argument,
3194 prefix << errStrm->str() << suffix);
3195 }
3196
3197 if (debug) {
3199 if (gblGood != 1) {
3200 std::ostringstream gblErrStrm;
3201 const std::string myErrStr =
3202 errStrm.get() != nullptr ? errStrm->str() : std::string("");
3204 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, gblErrStrm.str());
3205 }
3206 }
3207
3208 using range_type = std::pair<LO, LO>;
3209 const range_type origRowRng(rowOffset, static_cast<LO>(X.view_.origExtent(0)));
3210 const range_type rowRng(rowOffset, rowOffset + newNumRows);
3211
3212 wrapped_dual_view_type newView = takeSubview(X.view_, rowRng, ALL());
3213
3214 // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3215 // handling subviews of degenerate Views quite so well. For some
3216 // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3217 // 0. We work around by creating a new empty DualView of the
3218 // desired (degenerate) dimensions.
3219 if (newView.extent(0) == 0 &&
3220 newView.extent(1) != X.view_.extent(1)) {
3221 newView =
3222 allocDualView<Scalar, LO, GO, Node>(0, X.getNumVectors());
3223 }
3224
3225 MV subViewMV = X.isConstantStride() ? MV(subMap, newView) : MV(subMap, newView, X.whichVectors_());
3226
3227 if (debug) {
3228 const LO lclNumRowsRet = static_cast<LO>(subViewMV.getLocalLength());
3229 const LO numColsRet = static_cast<LO>(subViewMV.getNumVectors());
3230 if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3231 lclGood = 0;
3232 if (errStrm.get() == nullptr) {
3233 errStrm = std::unique_ptr<std::ostringstream>(new std::ostringstream);
3234 }
3235 *errStrm << " Proc " << myRank << ": subMap.getLocalNumElements(): " << newNumRows << ", subViewMV.getLocalLength(): " << lclNumRowsRet << ", X.getNumVectors(): " << numCols << ", subViewMV.getNumVectors(): " << numColsRet << endl;
3236 }
3238 if (gblGood != 1) {
3239 std::ostringstream gblErrStrm;
3240 if (myRank == 0) {
3241 gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3242 "dimensions on one or more processes:"
3243 << endl;
3244 }
3245 const std::string myErrStr =
3246 errStrm.get() != nullptr ? errStrm->str() : std::string("");
3248 gblErrStrm << suffix << endl;
3249 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, gblErrStrm.str());
3250 }
3251 }
3252
3253 if (verbose) {
3254 std::ostringstream os;
3255 os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3256 std::cerr << os.str();
3257 }
3258
3259 *this = subViewMV; // shallow copy
3260
3261 if (verbose) {
3262 std::ostringstream os;
3263 os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3264 std::cerr << os.str();
3265 }
3266}
3267
3268template <class Scalar, class LO, class GO, class Node>
3271 const map_type& subMap,
3272 const size_t rowOffset)
3273 : MultiVector(X, Teuchos::RCP<const map_type>(new map_type(subMap)),
3275
3276template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3277Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3279 offsetView(const Teuchos::RCP<const map_type>& subMap,
3280 const size_t offset) const {
3282 return Teuchos::rcp(new MV(*this, *subMap, offset));
3283}
3284
3285template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3286Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3288 offsetViewNonConst(const Teuchos::RCP<const map_type>& subMap,
3289 const size_t offset) {
3291 return Teuchos::rcp(new MV(*this, *subMap, offset));
3292}
3293
3294template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3295Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3297 subView(const Teuchos::ArrayView<const size_t>& cols) const {
3298 using Teuchos::Array;
3299 using Teuchos::rcp;
3301
3302 const size_t numViewCols = static_cast<size_t>(cols.size());
3304 numViewCols < 1, std::runtime_error,
3305 "Tpetra::MultiVector::subView"
3306 "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3307 "contain at least one entry, but cols.size() = "
3308 << cols.size()
3309 << " == 0.");
3310
3311 // Check whether the index set in cols is contiguous. If it is,
3312 // use the more efficient Range1D version of subView.
3313 bool contiguous = true;
3314 for (size_t j = 1; j < numViewCols; ++j) {
3315 if (cols[j] != cols[j - 1] + static_cast<size_t>(1)) {
3316 contiguous = false;
3317 break;
3318 }
3319 }
3320 if (contiguous) {
3321 if (numViewCols == 0) {
3322 // The output MV has no columns, so there is nothing to view.
3323 return rcp(new MV(this->getMap(), numViewCols));
3324 } else {
3325 // Use the more efficient contiguous-index-range version.
3326 return this->subView(Teuchos::Range1D(cols[0], cols[numViewCols - 1]));
3327 }
3328 }
3329
3330 if (isConstantStride()) {
3331 return rcp(new MV(this->getMap(), view_, cols));
3332 } else {
3333 Array<size_t> newcols(cols.size());
3334 for (size_t j = 0; j < numViewCols; ++j) {
3335 newcols[j] = whichVectors_[cols[j]];
3336 }
3337 return rcp(new MV(this->getMap(), view_, newcols()));
3338 }
3339}
3340
3341template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3342Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3344 subView(const Teuchos::Range1D& colRng) const {
3345 using Kokkos::ALL;
3346 using Kokkos::subview;
3347 using Teuchos::Array;
3348 using Teuchos::RCP;
3349 using Teuchos::rcp;
3350 using ::Tpetra::Details::Behavior;
3352 const char tfecfFuncName[] = "subView(Range1D): ";
3353
3354 const size_t lclNumRows = this->getLocalLength();
3355 const size_t numVecs = this->getNumVectors();
3356 // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3357 // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3358 // "at least one vector.");
3360 static_cast<size_t>(colRng.size()) > numVecs, std::runtime_error,
3361 "colRng.size() = " << colRng.size() << " > this->getNumVectors() = "
3362 << numVecs << ".");
3364 numVecs != 0 && colRng.size() != 0 &&
3365 (colRng.lbound() < static_cast<Teuchos::Ordinal>(0) ||
3366 static_cast<size_t>(colRng.ubound()) >= numVecs),
3367 std::invalid_argument, "Nonempty input range [" << colRng.lbound() << "," << colRng.ubound() << "] exceeds the valid range of column indices "
3368 "[0, "
3369 << numVecs << "].");
3370
3371 RCP<const MV> X_ret; // the MultiVector subview to return
3372
3373 // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3374 // broken for the case of views with zero rows. I will brutally
3375 // enforce that the subview has the correct dimensions. In
3376 // particular, in the case of zero rows, I will, if necessary,
3377 // create a new dual_view_type with zero rows and the correct
3378 // number of columns. In a debug build, I will use an all-reduce
3379 // to ensure that it has the correct dimensions on all processes.
3380
3381 const std::pair<size_t, size_t> rows(0, lclNumRows);
3382 if (colRng.size() == 0) {
3383 const std::pair<size_t, size_t> cols(0, 0); // empty range
3384 wrapped_dual_view_type X_sub = takeSubview(this->view_, ALL(), cols);
3385 X_ret = rcp(new MV(this->getMap(), X_sub));
3386 } else {
3387 // Returned MultiVector is constant stride only if *this is.
3388 if (isConstantStride()) {
3389 const std::pair<size_t, size_t> cols(colRng.lbound(),
3390 colRng.ubound() + 1);
3391 wrapped_dual_view_type X_sub = takeSubview(this->view_, ALL(), cols);
3392 X_ret = rcp(new MV(this->getMap(), X_sub));
3393 } else {
3394 if (static_cast<size_t>(colRng.size()) == static_cast<size_t>(1)) {
3395 // We're only asking for one column, so the result does have
3396 // constant stride, even though this MultiVector does not.
3397 const std::pair<size_t, size_t> col(whichVectors_[0] + colRng.lbound(),
3398 whichVectors_[0] + colRng.ubound() + 1);
3400 X_ret = rcp(new MV(this->getMap(), X_sub));
3401 } else {
3402 Array<size_t> which(whichVectors_.begin() + colRng.lbound(),
3403 whichVectors_.begin() + colRng.ubound() + 1);
3404 X_ret = rcp(new MV(this->getMap(), view_, which));
3405 }
3406 }
3407 }
3408
3409 const bool debug = Behavior::debug();
3410 if (debug) {
3411 using Teuchos::Comm;
3412 using Teuchos::outArg;
3413 using Teuchos::REDUCE_MIN;
3414 using Teuchos::reduceAll;
3415
3416 RCP<const Comm<int> > comm = this->getMap().is_null() ? Teuchos::null : this->getMap()->getComm();
3417 if (!comm.is_null()) {
3418 int lclSuccess = 1;
3419 int gblSuccess = 1;
3420
3421 if (X_ret.is_null()) {
3422 lclSuccess = 0;
3423 }
3426 "X_ret (the subview of this "
3427 "MultiVector; the return value of this method) is null on some MPI "
3428 "process in this MultiVector's communicator. This should never "
3429 "happen. Please report this bug to the Tpetra developers.");
3430 if (!X_ret.is_null() &&
3431 X_ret->getNumVectors() != static_cast<size_t>(colRng.size())) {
3432 lclSuccess = 0;
3433 }
3437 "X_ret->getNumVectors() != "
3438 "colRng.size(), on at least one MPI process in this MultiVector's "
3439 "communicator. This should never happen. "
3440 "Please report this bug to the Tpetra developers.");
3441 }
3442 }
3443 return X_ret;
3444}
3445
3446template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3447Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3449 subViewNonConst(const Teuchos::ArrayView<const size_t>& cols) {
3451 return Teuchos::rcp_const_cast<MV>(this->subView(cols));
3452}
3453
3454template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3455Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3457 subViewNonConst(const Teuchos::Range1D& colRng) {
3459 return Teuchos::rcp_const_cast<MV>(this->subView(colRng));
3460}
3461
3462template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3465 const size_t j)
3466 : base_type(X.getMap()) {
3467 using Kokkos::subview;
3468 typedef std::pair<size_t, size_t> range_type;
3469 const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3470
3471 const size_t numCols = X.getNumVectors();
3472 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(j >= numCols, std::invalid_argument, "Input index j (== " << j << ") exceeds valid column index range [0, " << numCols << " - 1].");
3473 const size_t jj = X.isConstantStride() ? static_cast<size_t>(j) : static_cast<size_t>(X.whichVectors_[j]);
3474 this->view_ = takeSubview(X.view_, Kokkos::ALL(), range_type(jj, jj + 1));
3475
3476 // mfh 31 Jul 2017: It would be unwise to execute concurrent
3477 // Export or Import operations with different subviews of a
3478 // MultiVector. Thus, it is safe to reuse communication buffers.
3479 // See #1560 discussion.
3480 //
3481 // We only need one column's worth of buffer for imports_ and
3482 // exports_. Taking subviews now ensures that their lengths will
3483 // be exactly what we need, so we won't have to resize them later.
3484 {
3485 const size_t newSize = X.imports_.extent(0) / numCols;
3486 const size_t offset = jj * newSize;
3487 auto device_view = subview(X.imports_.view_device(),
3488 range_type(offset, offset + newSize));
3489 auto host_view = subview(X.imports_.view_host(),
3490 range_type(offset, offset + newSize));
3491 this->imports_ = decltype(X.imports_)(device_view, host_view);
3492 }
3493 {
3494 const size_t newSize = X.exports_.extent(0) / numCols;
3495 const size_t offset = jj * newSize;
3496 auto device_view = subview(X.exports_.view_device(),
3497 range_type(offset, offset + newSize));
3498 auto host_view = subview(X.exports_.view_host(),
3499 range_type(offset, offset + newSize));
3500 this->exports_ = decltype(X.exports_)(device_view, host_view);
3501 }
3502 // These two DualViews already either have the right number of
3503 // entries, or zero entries. This means that we don't need to
3504 // resize them.
3505 this->numImportPacketsPerLID_ = X.numImportPacketsPerLID_;
3506 this->numExportPacketsPerLID_ = X.numExportPacketsPerLID_;
3507}
3508
3509template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3510Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3512 getVector(const size_t j) const {
3514 return Teuchos::rcp(new V(*this, j));
3515}
3516
3517template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3518Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3520 getVectorNonConst(const size_t j) {
3522 return Teuchos::rcp(new V(*this, j));
3523}
3524
3525template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3527 get1dCopy(const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const {
3528 using IST = impl_scalar_type;
3529 using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3530 Kokkos::HostSpace,
3531 Kokkos::MemoryUnmanaged>;
3532 const char tfecfFuncName[] = "get1dCopy: ";
3533
3534 const size_t numRows = this->getLocalLength();
3535 const size_t numCols = this->getNumVectors();
3536
3538 "LDA = " << LDA << " < numRows = " << numRows << ".");
3539 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numRows > size_t(0) && numCols > size_t(0) &&
3540 size_t(A.size()) < LDA * (numCols - 1) + numRows,
3541 std::runtime_error,
3542 "A.size() = " << A.size() << ", but its size must be at least "
3543 << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3544
3545 const std::pair<size_t, size_t> rowRange(0, numRows);
3546 const std::pair<size_t, size_t> colRange(0, numCols);
3547
3548 input_view_type A_view_orig(reinterpret_cast<IST*>(A.getRawPtr()),
3549 LDA, numCols);
3550 auto A_view = Kokkos::subview(A_view_orig, rowRange, colRange);
3551
3563 if (this->need_sync_host() && this->need_sync_device()) {
3566 throw std::runtime_error("Tpetra::MultiVector: A non-const view is alive outside and we cannot give a copy where host or device view can be modified outside");
3567 } else {
3568 const bool useHostView = view_.host_view_use_count() >= view_.device_view_use_count();
3569 if (this->isConstantStride()) {
3570 if (useHostView) {
3571 auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3572 // DEEP_COPY REVIEW - NOT TESTED
3573 Kokkos::deep_copy(A_view, srcView_host);
3574 } else {
3575 auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3576 // DEEP_COPY REVIEW - NOT TESTED
3577 Kokkos::deep_copy(A_view, srcView_device);
3578 }
3579 } else {
3580 for (size_t j = 0; j < numCols; ++j) {
3581 const size_t srcCol = this->whichVectors_[j];
3582 auto dstColView = Kokkos::subview(A_view, rowRange, j);
3583
3584 if (useHostView) {
3585 auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3586 auto srcColView_host = Kokkos::subview(srcView_host, rowRange, srcCol);
3587 // DEEP_COPY REVIEW - NOT TESTED
3588 Kokkos::deep_copy(dstColView, srcColView_host);
3589 } else {
3590 auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3591 auto srcColView_device = Kokkos::subview(srcView_device, rowRange, srcCol);
3592 // DEEP_COPY REVIEW - NOT TESTED
3593 Kokkos::deep_copy(dstColView, srcColView_device);
3594 }
3595 }
3596 }
3597 }
3598}
3599
3600template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3602 get2dCopy(const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const {
3604 const char tfecfFuncName[] = "get2dCopy: ";
3605 const size_t numRows = this->getLocalLength();
3606 const size_t numCols = this->getNumVectors();
3607
3609 static_cast<size_t>(ArrayOfPtrs.size()) != numCols,
3610 std::runtime_error,
3611 "Input array of pointers must contain as many "
3612 "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3613 << ArrayOfPtrs.size() << " != getNumVectors() = " << numCols << ".");
3614
3615 if (numRows != 0 && numCols != 0) {
3616 // No side effects until we've validated the input.
3617 for (size_t j = 0; j < numCols; ++j) {
3618 const size_t dstLen = static_cast<size_t>(ArrayOfPtrs[j].size());
3620 dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3621 "the input array of arrays is not long enough to fit all entries in "
3622 "that column of the MultiVector. ArrayOfPtrs[j].size() = "
3623 << dstLen << " < getLocalLength() = " << numRows << ".");
3624 }
3625
3626 // We've validated the input, so it's safe to start copying.
3627 for (size_t j = 0; j < numCols; ++j) {
3628 Teuchos::RCP<const V> X_j = this->getVector(j);
3629 const size_t LDA = static_cast<size_t>(ArrayOfPtrs[j].size());
3630 X_j->get1dCopy(ArrayOfPtrs[j], LDA);
3631 }
3632 }
3633}
3634
3635template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3636Teuchos::ArrayRCP<const Scalar>
3638 get1dView() const {
3639 if (getLocalLength() == 0 || getNumVectors() == 0) {
3640 return Teuchos::null;
3641 } else {
3643 !isConstantStride(), std::runtime_error,
3644 "Tpetra::MultiVector::"
3645 "get1dView: This MultiVector does not have constant stride, so it is "
3646 "not possible to view its data as a single array. You may check "
3647 "whether a MultiVector has constant stride by calling "
3648 "isConstantStride().");
3649 // Since get1dView() is and was always marked const, I have to
3650 // cast away const here in order not to break backwards
3651 // compatibility.
3652 auto X_lcl = getLocalViewHost(Access::ReadOnly);
3653 Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3654 Kokkos::Compat::persistingView(X_lcl);
3655 return Teuchos::arcp_reinterpret_cast<const Scalar>(dataAsArcp);
3656 }
3657}
3658
3659template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3660Teuchos::ArrayRCP<Scalar>
3663 if (this->getLocalLength() == 0 || this->getNumVectors() == 0) {
3664 return Teuchos::null;
3665 } else {
3666 TEUCHOS_TEST_FOR_EXCEPTION(!isConstantStride(), std::runtime_error,
3667 "Tpetra::MultiVector::"
3668 "get1dViewNonConst: This MultiVector does not have constant stride, "
3669 "so it is not possible to view its data as a single array. You may "
3670 "check whether a MultiVector has constant stride by calling "
3671 "isConstantStride().");
3672 auto X_lcl = getLocalViewHost(Access::ReadWrite);
3673 Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3674 Kokkos::Compat::persistingView(X_lcl);
3675 return Teuchos::arcp_reinterpret_cast<Scalar>(dataAsArcp);
3676 }
3677}
3678
3679template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3680Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3683 auto X_lcl = getLocalViewHost(Access::ReadWrite);
3684
3685 // Don't use the row range here on the outside, in order to avoid
3686 // a strided return type (in case Kokkos::subview is conservative
3687 // about that). Instead, use the row range for the column views
3688 // in the loop.
3689 const size_t myNumRows = this->getLocalLength();
3690 const size_t numCols = this->getNumVectors();
3691 const Kokkos::pair<size_t, size_t> rowRange(0, myNumRows);
3692
3693 Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views(numCols);
3694 for (size_t j = 0; j < numCols; ++j) {
3695 const size_t col = this->isConstantStride() ? j : this->whichVectors_[j];
3696 auto X_lcl_j = Kokkos::subview(X_lcl, rowRange, col);
3697 Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3698 Kokkos::Compat::persistingView(X_lcl_j);
3699 views[j] = Teuchos::arcp_reinterpret_cast<Scalar>(X_lcl_j_arcp);
3700 }
3701 return views;
3702}
3703
3704template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3707 getLocalViewHost(Access::ReadOnlyStruct s) const {
3708 return view_.getHostView(s);
3709}
3710
3711template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3714 getLocalViewHost(Access::ReadWriteStruct s) {
3715 return view_.getHostView(s);
3716}
3717
3718template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3721 getLocalViewHost(Access::OverwriteAllStruct s) {
3722 return view_.getHostView(s);
3723}
3724
3725template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3728 getLocalViewDevice(Access::ReadOnlyStruct s) const {
3729 return view_.getDeviceView(s);
3730}
3731
3732template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3735 getLocalViewDevice(Access::ReadWriteStruct s) {
3736 return view_.getDeviceView(s);
3737}
3738
3739template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3742 getLocalViewDevice(Access::OverwriteAllStruct s) {
3743 return view_.getDeviceView(s);
3744}
3745
3746template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3752
3753template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3754Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3756 get2dView() const {
3757 // Since get2dView() is and was always marked const, I have to
3758 // cast away const here in order not to break backwards
3759 // compatibility.
3760 auto X_lcl = getLocalViewHost(Access::ReadOnly);
3761
3762 // Don't use the row range here on the outside, in order to avoid
3763 // a strided return type (in case Kokkos::subview is conservative
3764 // about that). Instead, use the row range for the column views
3765 // in the loop.
3766 const size_t myNumRows = this->getLocalLength();
3767 const size_t numCols = this->getNumVectors();
3768 const Kokkos::pair<size_t, size_t> rowRange(0, myNumRows);
3769
3770 Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views(numCols);
3771 for (size_t j = 0; j < numCols; ++j) {
3772 const size_t col = this->isConstantStride() ? j : this->whichVectors_[j];
3773 auto X_lcl_j = Kokkos::subview(X_lcl, rowRange, col);
3774 Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3775 Kokkos::Compat::persistingView(X_lcl_j);
3776 views[j] = Teuchos::arcp_reinterpret_cast<const Scalar>(X_lcl_j_arcp);
3777 }
3778 return views;
3779}
3780
3781template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3783 multiply(Teuchos::ETransp transA,
3784 Teuchos::ETransp transB,
3785 const Scalar& alpha,
3788 const Scalar& beta) {
3789 using std::endl;
3790 using Teuchos::CONJ_TRANS;
3791 using Teuchos::NO_TRANS;
3792 using Teuchos::RCP;
3793 using Teuchos::rcp;
3794 using Teuchos::rcpFromRef;
3795 using Teuchos::TRANS;
3796 using ::Tpetra::Details::ProfilingRegion;
3797 using ATS = KokkosKernels::ArithTraits<impl_scalar_type>;
3798 using LO = local_ordinal_type;
3799 using STS = Teuchos::ScalarTraits<Scalar>;
3801 const char tfecfFuncName[] = "multiply: ";
3802 ProfilingRegion region("Tpetra::MV::multiply");
3803
3804 // This routine performs a variety of matrix-matrix multiply
3805 // operations, interpreting the MultiVector (this-aka C , A and B)
3806 // as 2D matrices. Variations are due to the fact that A, B and C
3807 // can be locally replicated or globally distributed MultiVectors
3808 // and that we may or may not operate with the transpose of A and
3809 // B. Possible cases are:
3810 //
3811 // Operations # Cases Notes
3812 // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
3813 // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
3814 // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
3815 //
3816 // The following operations are not meaningful for 1-D
3817 // distributions:
3818 //
3819 // u1) C(local) = A^T(distr) * B^T(distr) 1
3820 // u2) C(local) = A (distr) * B^X(distr) 2
3821 // u3) C(distr) = A^X(local) * B^X(local) 4
3822 // u4) C(distr) = A^X(local) * B^X(distr) 4
3823 // u5) C(distr) = A^T(distr) * B^X(local) 2
3824 // u6) C(local) = A^X(distr) * B^X(local) 4
3825 // u7) C(distr) = A^X(distr) * B^X(local) 4
3826 // u8) C(local) = A^X(local) * B^X(distr) 4
3827 //
3828 // Total number of cases: 32 (= 2^5).
3829
3830 impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
3831
3832 const bool A_is_local = !A.isDistributed();
3833 const bool B_is_local = !B.isDistributed();
3834 const bool C_is_local = !this->isDistributed();
3835
3836 // In debug mode, check compatibility of local dimensions. We
3837 // only do this in debug mode, since it requires an all-reduce
3838 // to ensure correctness on all processses. It's entirely
3839 // possible that only some processes may have incompatible local
3840 // dimensions. Throwing an exception only on those processes
3841 // could cause this method to hang.
3842 const bool debug = ::Tpetra::Details::Behavior::debug();
3843 if (debug) {
3844 auto myMap = this->getMap();
3845 if (!myMap.is_null() && !myMap->getComm().is_null()) {
3846 using Teuchos::outArg;
3847 using Teuchos::REDUCE_MIN;
3848 using Teuchos::reduceAll;
3849
3850 auto comm = myMap->getComm();
3851 const size_t A_nrows =
3852 (transA != NO_TRANS) ? A.getNumVectors() : A.getLocalLength();
3853 const size_t A_ncols =
3854 (transA != NO_TRANS) ? A.getLocalLength() : A.getNumVectors();
3855 const size_t B_nrows =
3856 (transB != NO_TRANS) ? B.getNumVectors() : B.getLocalLength();
3857 const size_t B_ncols =
3858 (transB != NO_TRANS) ? B.getLocalLength() : B.getNumVectors();
3859
3860 const bool lclBad = this->getLocalLength() != A_nrows ||
3861 this->getNumVectors() != B_ncols || A_ncols != B_nrows;
3862
3863 const int myRank = comm->getRank();
3864 std::ostringstream errStrm;
3865 if (this->getLocalLength() != A_nrows) {
3866 errStrm << "Proc " << myRank << ": this->getLocalLength()="
3867 << this->getLocalLength() << " != A_nrows=" << A_nrows
3868 << "." << std::endl;
3869 }
3870 if (this->getNumVectors() != B_ncols) {
3871 errStrm << "Proc " << myRank << ": this->getNumVectors()="
3872 << this->getNumVectors() << " != B_ncols=" << B_ncols
3873 << "." << std::endl;
3874 }
3875 if (A_ncols != B_nrows) {
3876 errStrm << "Proc " << myRank << ": A_ncols="
3877 << A_ncols << " != B_nrows=" << B_nrows
3878 << "." << std::endl;
3879 }
3880
3881 const int lclGood = lclBad ? 0 : 1;
3882 int gblGood = 0;
3884 if (gblGood != 1) {
3885 std::ostringstream os;
3887
3888 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3889 "Inconsistent local dimensions on at "
3890 "least one process in this object's communicator."
3891 << std::endl
3892 << "Operation: "
3893 << "C(" << (C_is_local ? "local" : "distr") << ") = "
3894 << alpha << "*A"
3895 << (transA == Teuchos::TRANS ? "^T" : (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3896 << "(" << (A_is_local ? "local" : "distr") << ") * "
3897 << beta << "*B"
3898 << (transA == Teuchos::TRANS ? "^T" : (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3899 << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
3900 << "Global dimensions: C(" << this->getGlobalLength() << ", "
3901 << this->getNumVectors() << "), A(" << A.getGlobalLength()
3902 << ", " << A.getNumVectors() << "), B(" << B.getGlobalLength()
3903 << ", " << B.getNumVectors() << ")." << std::endl
3904 << os.str());
3905 }
3906 }
3907 }
3908
3909 // Case 1: C(local) = A^X(local) * B^X(local)
3910 const bool Case1 = C_is_local && A_is_local && B_is_local;
3911 // Case 2: C(local) = A^T(distr) * B (distr)
3912 const bool Case2 = C_is_local && !A_is_local && !B_is_local &&
3913 transA != NO_TRANS &&
3914 transB == NO_TRANS;
3915 // Case 3: C(distr) = A (distr) * B^X(local)
3916 const bool Case3 = !C_is_local && !A_is_local && B_is_local &&
3917 transA == NO_TRANS;
3918
3919 // Test that we are considering a meaningful case
3920 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!Case1 && !Case2 && !Case3, std::runtime_error,
3921 "Multiplication of op(A) and op(B) into *this is not a "
3922 "supported use case.");
3923
3924 if (beta != STS::zero() && Case2) {
3925 // If Case2, then C is local and contributions must be summed
3926 // across all processes. However, if beta != 0, then accumulate
3927 // beta*C into the sum. When summing across all processes, we
3928 // only want to accumulate this once, so set beta == 0 on all
3929 // processes except Process 0.
3930 const int myRank = this->getMap()->getComm()->getRank();
3931 if (myRank != 0) {
3932 beta_local = ATS::zero();
3933 }
3934 }
3935
3936 // We only know how to do matrix-matrix multiplies if all the
3937 // MultiVectors have constant stride. If not, we have to make
3938 // temporary copies of those MultiVectors (including possibly
3939 // *this) that don't have constant stride.
3940 RCP<MV> C_tmp;
3941 if (!isConstantStride()) {
3942 C_tmp = rcp(new MV(*this, Teuchos::Copy)); // deep copy
3943 } else {
3944 C_tmp = rcp(this, false);
3945 }
3946
3948 if (!A.isConstantStride()) {
3949 A_tmp = rcp(new MV(A, Teuchos::Copy)); // deep copy
3950 } else {
3951 A_tmp = rcpFromRef(A);
3952 }
3953
3955 if (!B.isConstantStride()) {
3956 B_tmp = rcp(new MV(B, Teuchos::Copy)); // deep copy
3957 } else {
3958 B_tmp = rcpFromRef(B);
3959 }
3960
3961 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!C_tmp->isConstantStride() || !B_tmp->isConstantStride() ||
3962 !A_tmp->isConstantStride(),
3963 std::logic_error,
3964 "Failed to make temporary constant-stride copies of MultiVectors.");
3965
3966 {
3967 const LO A_lclNumRows = A_tmp->getLocalLength();
3968 const LO A_numVecs = A_tmp->getNumVectors();
3969 auto A_lcl = A_tmp->getLocalViewDevice(Access::ReadOnly);
3970 auto A_sub = Kokkos::subview(A_lcl,
3971 std::make_pair(LO(0), A_lclNumRows),
3972 std::make_pair(LO(0), A_numVecs));
3973
3974 const LO B_lclNumRows = B_tmp->getLocalLength();
3975 const LO B_numVecs = B_tmp->getNumVectors();
3976 auto B_lcl = B_tmp->getLocalViewDevice(Access::ReadOnly);
3977 auto B_sub = Kokkos::subview(B_lcl,
3978 std::make_pair(LO(0), B_lclNumRows),
3979 std::make_pair(LO(0), B_numVecs));
3980
3981 const LO C_lclNumRows = C_tmp->getLocalLength();
3982 const LO C_numVecs = C_tmp->getNumVectors();
3983
3984 auto C_lcl = C_tmp->getLocalViewDevice(Access::ReadWrite);
3985 auto C_sub = Kokkos::subview(C_lcl,
3986 std::make_pair(LO(0), C_lclNumRows),
3987 std::make_pair(LO(0), C_numVecs));
3988 const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' : (transA == Teuchos::TRANS ? 'T' : 'C'));
3989 const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' : (transB == Teuchos::TRANS ? 'T' : 'C'));
3991
3992 ProfilingRegion regionGemm("Tpetra::MV::multiply-call-gemm");
3993
3994 KokkosBlas::gemm(&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
3995 beta_local, C_sub);
3996 }
3997
3998 if (!isConstantStride()) {
3999 ::Tpetra::deep_copy(*this, *C_tmp); // Copy the result back into *this.
4000 }
4001
4002 // Dispose of (possibly) extra copies of A and B.
4003 A_tmp = Teuchos::null;
4004 B_tmp = Teuchos::null;
4005
4006 // If Case 2 then sum up *this and distribute it to all processes.
4007 if (Case2) {
4008 this->reduce();
4009 }
4010}
4011
4012template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4018 using Kokkos::ALL;
4019 using Kokkos::subview;
4020 const char tfecfFuncName[] = "elementWiseMultiply: ";
4021
4022 const size_t lclNumRows = this->getLocalLength();
4023 const size_t numVecs = this->getNumVectors();
4024
4025 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclNumRows != A.getLocalLength() || lclNumRows != B.getLocalLength(),
4026 std::runtime_error, "MultiVectors do not have the same local length.");
4028 numVecs != B.getNumVectors(), std::runtime_error,
4029 "this->getNumVectors"
4030 "() = "
4031 << numVecs << " != B.getNumVectors() = " << B.getNumVectors()
4032 << ".");
4033
4034 auto this_view = this->getLocalViewDevice(Access::ReadWrite);
4035 auto A_view = A.getLocalViewDevice(Access::ReadOnly);
4036 auto B_view = B.getLocalViewDevice(Access::ReadOnly);
4037
4038 if (isConstantStride() && B.isConstantStride()) {
4039 // A is just a Vector; it only has one column, so it always has
4040 // constant stride.
4041 //
4042 // If both *this and B have constant stride, we can do an
4043 // element-wise multiply on all columns at once.
4044 KokkosBlas::mult(scalarThis,
4045 this_view,
4046 scalarAB,
4047 subview(A_view, ALL(), 0),
4048 B_view);
4049 } else {
4050 for (size_t j = 0; j < numVecs; ++j) {
4051 const size_t C_col = isConstantStride() ? j : whichVectors_[j];
4052 const size_t B_col = B.isConstantStride() ? j : B.whichVectors_[j];
4053 KokkosBlas::mult(scalarThis,
4055 scalarAB,
4056 subview(A_view, ALL(), 0),
4057 subview(B_view, ALL(), B_col));
4058 }
4059 }
4060}
4061
4062template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4064 reduce() {
4065 using ::Tpetra::Details::allReduceView;
4066 using ::Tpetra::Details::ProfilingRegion;
4067 ProfilingRegion region("Tpetra::MV::reduce");
4068
4069 const auto map = this->getMap();
4070 if (map.get() == nullptr) {
4071 return;
4072 }
4073 const auto comm = map->getComm();
4074 if (comm.get() == nullptr) {
4075 return;
4076 }
4077
4078 // Avoid giving device buffers to MPI. Even if MPI handles them
4079 // correctly, doing so may not perform well.
4080 const bool changed_on_device = this->need_sync_host();
4081 if (changed_on_device) {
4082 // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4083 // and host will be separate allocations. In that case, it may
4084 // pay to do the all-reduce from device to host.
4085 Kokkos::fence("MultiVector::reduce"); // for UVM getLocalViewDevice is UVM which can be read as host by allReduceView, so we must not read until device is fenced
4086 auto X_lcl = this->getLocalViewDevice(Access::ReadWrite);
4087 allReduceView(X_lcl, X_lcl, *comm);
4088 } else {
4089 auto X_lcl = this->getLocalViewHost(Access::ReadWrite);
4090 allReduceView(X_lcl, X_lcl, *comm);
4091 }
4092}
4093
4094template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4097 const size_t col,
4100 const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4101 const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4104 std::runtime_error,
4105 "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4106 << " is invalid. The range of valid row indices on this process "
4107 << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4108 << ", " << maxLocalIndex << "].");
4110 vectorIndexOutOfRange(col),
4111 std::runtime_error,
4112 "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4113 << " of the multivector is invalid.");
4114 }
4115
4116 auto view = this->getLocalViewHost(Access::ReadWrite);
4117 const size_t colInd = isConstantStride() ? col : whichVectors_[col];
4118 view(lclRow, colInd) = ScalarValue;
4119}
4120
4121template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4124 const size_t col,
4125 const impl_scalar_type& value,
4126 const bool atomic) {
4128 const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4129 const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4132 std::runtime_error,
4133 "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4134 << " is invalid. The range of valid row indices on this process "
4135 << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4136 << ", " << maxLocalIndex << "].");
4138 vectorIndexOutOfRange(col),
4139 std::runtime_error,
4140 "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4141 << " of the multivector is invalid.");
4142 }
4143
4144 const size_t colInd = isConstantStride() ? col : whichVectors_[col];
4145
4146 auto view = this->getLocalViewHost(Access::ReadWrite);
4147 if (atomic) {
4148 Kokkos::atomic_add(&(view(lclRow, colInd)), value);
4149 } else {
4150 view(lclRow, colInd) += value;
4151 }
4152}
4153
4154template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4157 const size_t col,
4159 // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4160 // touches the RCP's reference count, which isn't thread safe.
4161 const LocalOrdinal lclRow = this->map_->getLocalElement(gblRow);
4162
4164 const char tfecfFuncName[] = "replaceGlobalValue: ";
4165 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid(),
4166 std::runtime_error,
4167 "Global row index " << gblRow << " is not present on this process "
4168 << this->getMap()->getComm()->getRank() << ".");
4169 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->vectorIndexOutOfRange(col), std::runtime_error,
4170 "Vector index " << col << " of the MultiVector is invalid.");
4171 }
4172
4173 this->replaceLocalValue(lclRow, col, ScalarValue);
4174}
4175
4176template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4179 const size_t col,
4180 const impl_scalar_type& value,
4181 const bool atomic) {
4182 // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4183 // touches the RCP's reference count, which isn't thread safe.
4184 const LocalOrdinal lclRow = this->map_->getLocalElement(globalRow);
4185
4188 lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid(),
4189 std::runtime_error,
4190 "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4191 << " is not present on this process "
4192 << this->getMap()->getComm()->getRank() << ".");
4194 vectorIndexOutOfRange(col),
4195 std::runtime_error,
4196 "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4197 << " of the multivector is invalid.");
4198 }
4199
4200 this->sumIntoLocalValue(lclRow, col, value, atomic);
4201}
4202
4203template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4204template <class T>
4205Teuchos::ArrayRCP<T>
4207 getSubArrayRCP(Teuchos::ArrayRCP<T> arr,
4208 size_t j) const {
4209 typedef Kokkos::DualView<impl_scalar_type*,
4210 typename dual_view_type::array_layout,
4213 const size_t col = isConstantStride() ? j : whichVectors_[j];
4215 Kokkos::subview(view_, Kokkos::ALL(), col);
4216 return Kokkos::Compat::persistingView(X_col.view_device());
4217}
4218
4219template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4221 need_sync_host() const {
4222 return view_.need_sync_host();
4223}
4224
4225template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4227 need_sync_device() const {
4228 return view_.need_sync_device();
4229}
4230
4231template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4232std::string
4234 descriptionImpl(const std::string& className) const {
4235 using Teuchos::TypeNameTraits;
4236
4237 std::ostringstream out;
4238 out << "\"" << className << "\": {";
4239 out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name()
4240 << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name()
4241 << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name()
4242 << ", Node" << Node::name()
4243 << "}, ";
4244 if (this->getObjectLabel() != "") {
4245 out << "Label: \"" << this->getObjectLabel() << "\", ";
4246 }
4247 out << ", numRows: " << this->getGlobalLength();
4248 if (className != "Tpetra::Vector") {
4249 out << ", numCols: " << this->getNumVectors()
4250 << ", isConstantStride: " << this->isConstantStride();
4251 }
4252 if (this->isConstantStride()) {
4253 out << ", columnStride: " << this->getStride();
4254 }
4255 out << "}";
4256
4257 return out.str();
4258}
4259
4260template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4261std::string
4263 description() const {
4264 return this->descriptionImpl("Tpetra::MultiVector");
4265}
4266
4267namespace Details {
4268template <typename ViewType>
4269void print_vector(Teuchos::FancyOStream& out, const char* prefix, const ViewType& v) {
4270 using std::endl;
4271 static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename ViewType::memory_space>::accessible,
4272 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a host-accessible view.");
4273 static_assert(ViewType::rank == 2,
4274 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a rank-2 view.");
4275 // The square braces [] and their contents are in Matlab
4276 // format, so users may copy and paste directly into Matlab.
4277 out << "Values(" << prefix << "): " << std::endl
4278 << "[";
4279 const size_t numRows = v.extent(0);
4280 const size_t numCols = v.extent(1);
4281 if (numCols == 1) {
4282 for (size_t i = 0; i < numRows; ++i) {
4283 out << v(i, 0);
4284 if (i + 1 < numRows) {
4285 out << "; ";
4286 }
4287 }
4288 } else {
4289 for (size_t i = 0; i < numRows; ++i) {
4290 for (size_t j = 0; j < numCols; ++j) {
4291 out << v(i, j);
4292 if (j + 1 < numCols) {
4293 out << ", ";
4294 }
4295 }
4296 if (i + 1 < numRows) {
4297 out << "; ";
4298 }
4299 }
4300 }
4301 out << "]" << endl;
4302}
4303} // namespace Details
4304
4305template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4306std::string
4308 localDescribeToString(const Teuchos::EVerbosityLevel vl) const {
4309 typedef LocalOrdinal LO;
4310 using std::endl;
4311
4312 if (vl <= Teuchos::VERB_LOW) {
4313 return std::string();
4314 }
4315 auto map = this->getMap();
4316 if (map.is_null()) {
4317 return std::string();
4318 }
4319 auto outStringP = Teuchos::rcp(new std::ostringstream());
4320 auto outp = Teuchos::getFancyOStream(outStringP);
4321 Teuchos::FancyOStream& out = *outp;
4322 auto comm = map->getComm();
4323 const int myRank = comm->getRank();
4324 const int numProcs = comm->getSize();
4325
4326 out << "Process " << myRank << " of " << numProcs << ":" << endl;
4327 Teuchos::OSTab tab1(out);
4328
4329 // VERB_MEDIUM and higher prints getLocalLength()
4330 const LO lclNumRows = static_cast<LO>(this->getLocalLength());
4331 out << "Local number of rows: " << lclNumRows << endl;
4332
4333 if (vl > Teuchos::VERB_MEDIUM) {
4334 // VERB_HIGH and higher prints isConstantStride() and getStride().
4335 // The first is only relevant if the Vector has multiple columns.
4336 if (this->getNumVectors() != static_cast<size_t>(1)) {
4337 out << "isConstantStride: " << this->isConstantStride() << endl;
4338 }
4339 if (this->isConstantStride()) {
4340 out << "Column stride: " << this->getStride() << endl;
4341 }
4342
4343 if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4344 // VERB_EXTREME prints values. If we're not doing univied memory, we'll
4346 // so we can't use our regular accessor functins
4347
4348 // NOTE: This is an occasion where we do *not* want the auto-sync stuff
4349 // to trigger (since this function is conceptually const). Thus, we
4350 // get *copies* of the view's data instead.
4351 auto X_dev = view_.getDeviceCopy();
4352 auto X_host = view_.getHostCopy();
4353
4354 if (X_dev.data() == X_host.data()) {
4355 // One single allocation
4356 Details::print_vector(out, "unified", X_host);
4357 } else {
4358 Details::print_vector(out, "host", X_host);
4359 Details::print_vector(out, "dev", X_dev);
4360 }
4361 }
4362 }
4363 out.flush(); // make sure the ostringstream got everything
4364 return outStringP->str();
4365}
4366
4367template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4369 describeImpl(Teuchos::FancyOStream& out,
4370 const std::string& className,
4371 const Teuchos::EVerbosityLevel verbLevel) const {
4372 using std::endl;
4373 using Teuchos::TypeNameTraits;
4374 using Teuchos::VERB_DEFAULT;
4375 using Teuchos::VERB_LOW;
4376 using Teuchos::VERB_NONE;
4377 const Teuchos::EVerbosityLevel vl =
4379
4380 if (vl == VERB_NONE) {
4381 return; // don't print anything
4382 }
4383 // If this Vector's Comm is null, then the Vector does not
4384 // participate in collective operations with the other processes.
4385 // In that case, it is not even legal to call this method. The
4386 // reasonable thing to do in that case is nothing.
4387 auto map = this->getMap();
4388 if (map.is_null()) {
4389 return;
4390 }
4391 auto comm = map->getComm();
4392 if (comm.is_null()) {
4393 return;
4394 }
4395
4396 const int myRank = comm->getRank();
4397
4398 // Only Process 0 should touch the output stream, but this method
4399 // in general may need to do communication. Thus, we may need to
4400 // preserve the current tab level across multiple "if (myRank ==
4401 // 0) { ... }" inner scopes. This is why we sometimes create
4402 // OSTab instances by pointer, instead of by value. We only need
4403 // to create them by pointer if the tab level must persist through
4404 // multiple inner scopes.
4405 Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4406
4407 // VERB_LOW and higher prints the equivalent of description().
4408 if (myRank == 0) {
4409 tab0 = Teuchos::rcp(new Teuchos::OSTab(out));
4410 out << "\"" << className << "\":" << endl;
4411 tab1 = Teuchos::rcp(new Teuchos::OSTab(out));
4412 {
4413 out << "Template parameters:" << endl;
4414 Teuchos::OSTab tab2(out);
4415 out << "Scalar: " << TypeNameTraits<Scalar>::name() << endl
4416 << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name() << endl
4417 << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name() << endl
4418 << "Node: " << Node::name() << endl;
4419 }
4420 if (this->getObjectLabel() != "") {
4421 out << "Label: \"" << this->getObjectLabel() << "\", ";
4422 }
4423 out << "Global number of rows: " << this->getGlobalLength() << endl;
4424 if (className != "Tpetra::Vector") {
4425 out << "Number of columns: " << this->getNumVectors() << endl;
4426 }
4427 // getStride() may differ on different processes, so it (and
4428 // isConstantStride()) properly belong to per-process data.
4429 }
4430
4431 // This is collective over the Map's communicator.
4432 if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4433 const std::string lclStr = this->localDescribeToString(vl);
4435 }
4436}
4437
4438template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4440 describe(Teuchos::FancyOStream& out,
4441 const Teuchos::EVerbosityLevel verbLevel) const {
4442 this->describeImpl(out, "Tpetra::MultiVector", verbLevel);
4443}
4444
4445template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4447 removeEmptyProcessesInPlace(const Teuchos::RCP<const map_type>& newMap) {
4448 replaceMap(newMap);
4449}
4450
4451template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4454 using ::Tpetra::Details::localDeepCopy;
4455 const char prefix[] = "Tpetra::MultiVector::assign: ";
4456
4457 TEUCHOS_TEST_FOR_EXCEPTION(this->getGlobalLength() != src.getGlobalLength() ||
4458 this->getNumVectors() != src.getNumVectors(),
4459 std::invalid_argument,
4460 prefix << "Global dimensions of the two Tpetra::MultiVector "
4461 "objects do not match. src has dimensions ["
4462 << src.getGlobalLength()
4463 << "," << src.getNumVectors() << "], and *this has dimensions ["
4464 << this->getGlobalLength() << "," << this->getNumVectors() << "].");
4465 // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4466 TEUCHOS_TEST_FOR_EXCEPTION(this->getLocalLength() != src.getLocalLength(), std::invalid_argument,
4467 prefix << "The local row counts of the two Tpetra::MultiVector "
4468 "objects do not match. src has "
4469 << src.getLocalLength() << " row(s) "
4470 << " and *this has " << this->getLocalLength() << " row(s).");
4471
4472 // See #1510. We're writing to *this, so we don't care about its
4473 // contents in either memory space, and we don't want
4474 // DualView::modify to complain about "concurrent modification" of
4475 // host and device Views.
4476
4477 // KJ: this is problematic. assign funtion is used to construct a subvector
4478 // if the sync flag is reset here, it lose all our control over getLocalView interface
4479 // this->clear_sync_state();
4480
4481 // If need sync to device, then host has most recent version.
4482 const bool src_last_updated_on_host = src.need_sync_device();
4483
4485 localDeepCopy(this->getLocalViewHost(Access::ReadWrite),
4486 src.getLocalViewHost(Access::ReadOnly),
4487 this->isConstantStride(),
4488 src.isConstantStride(),
4489 this->whichVectors_(),
4490 src.whichVectors_());
4491 } else {
4492 localDeepCopy(this->getLocalViewDevice(Access::ReadWrite),
4493 src.getLocalViewDevice(Access::ReadOnly),
4494 this->isConstantStride(),
4495 src.isConstantStride(),
4496 this->whichVectors_(),
4497 src.whichVectors_());
4498 }
4499}
4500
4501template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4502template <class T>
4503Teuchos::RCP<MultiVector<T, LocalOrdinal, GlobalOrdinal, Node> >
4505 convert() const {
4506 using Teuchos::RCP;
4507
4508 auto newMV = Teuchos::rcp(new MultiVector<T, LocalOrdinal, GlobalOrdinal, Node>(this->getMap(),
4509 this->getNumVectors(),
4510 /*zeroOut=*/false));
4511 Tpetra::deep_copy(*newMV, *this);
4512 return newMV;
4513}
4514
4515template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4518 using ::Tpetra::Details::PackTraits;
4519
4520 const size_t l1 = this->getLocalLength();
4521 const size_t l2 = vec.getLocalLength();
4522 if ((l1 != l2) || (this->getNumVectors() != vec.getNumVectors())) {
4523 return false;
4524 }
4525
4526 return true;
4527}
4528
4529template <class ST, class LO, class GO, class NT>
4532 std::swap(mv.map_, this->map_);
4533 std::swap(mv.view_, this->view_);
4534 std::swap(mv.whichVectors_, this->whichVectors_);
4535}
4536
4537#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4538template <class ST, class LO, class GO, class NT>
4540 const Teuchos::SerialDenseMatrix<int, ST>& src) {
4541 using ::Tpetra::Details::localDeepCopy;
4542 using MV = MultiVector<ST, LO, GO, NT>;
4543 using IST = typename MV::impl_scalar_type;
4544 using input_view_type =
4545 Kokkos::View<const IST**, Kokkos::LayoutLeft,
4546 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4547 using pair_type = std::pair<LO, LO>;
4548
4549 const LO numRows = static_cast<LO>(src.numRows());
4550 const LO numCols = static_cast<LO>(src.numCols());
4551
4552 TEUCHOS_TEST_FOR_EXCEPTION(numRows != static_cast<LO>(dst.getLocalLength()) ||
4553 numCols != static_cast<LO>(dst.getNumVectors()),
4554 std::invalid_argument, "Tpetra::deep_copy: On Process " << dst.getMap()->getComm()->getRank() << ", dst is " << dst.getLocalLength() << " x " << dst.getNumVectors() << ", but src is " << numRows << " x " << numCols << ".");
4555
4556 const IST* src_raw = reinterpret_cast<const IST*>(src.values());
4557 input_view_type src_orig(src_raw, src.stride(), numCols);
4558 input_view_type src_view =
4559 Kokkos::subview(src_orig, pair_type(0, numRows), Kokkos::ALL());
4560
4561 constexpr bool src_isConstantStride = true;
4562 Teuchos::ArrayView<const size_t> srcWhichVectors(nullptr, 0);
4563 localDeepCopy(dst.getLocalViewHost(Access::ReadWrite),
4564 src_view,
4565 dst.isConstantStride(),
4567 getMultiVectorWhichVectors(dst),
4569}
4570
4571template <class ST, class LO, class GO, class NT>
4572void deep_copy(Teuchos::SerialDenseMatrix<int, ST>& dst,
4573 const MultiVector<ST, LO, GO, NT>& src) {
4574 using ::Tpetra::Details::localDeepCopy;
4575 using MV = MultiVector<ST, LO, GO, NT>;
4576 using IST = typename MV::impl_scalar_type;
4577 using output_view_type =
4578 Kokkos::View<IST**, Kokkos::LayoutLeft,
4579 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4580 using pair_type = std::pair<LO, LO>;
4581
4582 const LO numRows = static_cast<LO>(dst.numRows());
4583 const LO numCols = static_cast<LO>(dst.numCols());
4584
4585 TEUCHOS_TEST_FOR_EXCEPTION(numRows != static_cast<LO>(src.getLocalLength()) ||
4586 numCols != static_cast<LO>(src.getNumVectors()),
4587 std::invalid_argument, "Tpetra::deep_copy: On Process " << src.getMap()->getComm()->getRank() << ", src is " << src.getLocalLength() << " x " << src.getNumVectors() << ", but dst is " << numRows << " x " << numCols << ".");
4588
4589 IST* dst_raw = reinterpret_cast<IST*>(dst.values());
4590 output_view_type dst_orig(dst_raw, dst.stride(), numCols);
4591 auto dst_view =
4592 Kokkos::subview(dst_orig, pair_type(0, numRows), Kokkos::ALL());
4593
4594 constexpr bool dst_isConstantStride = true;
4595 Teuchos::ArrayView<const size_t> dstWhichVectors(nullptr, 0);
4596
4597 // Prefer the host version of src's data.
4598 localDeepCopy(dst_view,
4599 src.getLocalViewHost(Access::ReadOnly),
4601 src.isConstantStride(),
4603 getMultiVectorWhichVectors(src));
4604}
4605#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4606
4607template <class Scalar, class LO, class GO, class NT>
4608Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4609createMultiVector(const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4610 size_t numVectors) {
4612 return Teuchos::rcp(new MV(map, numVectors));
4613}
4614
4615template <class ST, class LO, class GO, class NT>
4618 typedef MultiVector<ST, LO, GO, NT> MV;
4619 MV cpy(src.getMap(), src.getNumVectors(), false);
4620 cpy.assign(src);
4621 return cpy;
4622}
4623
4624} // namespace Tpetra
4625
4626//
4627// Explicit instantiation macro
4628//
4629// Must be expanded from within the Tpetra namespace!
4630//
4631
4632#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4633#define TPETRA_MULTIVECTOR_INSTANT(SCALAR, LO, GO, NODE) \
4634 template class MultiVector<SCALAR, LO, GO, NODE>; \
4635 template MultiVector<SCALAR, LO, GO, NODE> createCopy(const MultiVector<SCALAR, LO, GO, NODE>& src); \
4636 template Teuchos::RCP<MultiVector<SCALAR, LO, GO, NODE> > createMultiVector(const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4637 template void deep_copy(MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4638 template void deep_copy(Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4639
4640#else
4641#define TPETRA_MULTIVECTOR_INSTANT(SCALAR, LO, GO, NODE) \
4642 template class MultiVector<SCALAR, LO, GO, NODE>; \
4643 template MultiVector<SCALAR, LO, GO, NODE> createCopy(const MultiVector<SCALAR, LO, GO, NODE>& src);
4644
4645#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4646
4647#define TPETRA_MULTIVECTOR_CONVERT_INSTANT(SO, SI, LO, GO, NODE) \
4648 \
4649 template Teuchos::RCP<MultiVector<SO, LO, GO, NODE> > \
4650 MultiVector<SI, LO, GO, NODE>::convert<SO>() const;
4651
4652#endif // TPETRA_MULTIVECTOR_DEF_HPP
Functions for initializing and finalizing Tpetra.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
All-reduce a 1-D or 2-D Kokkos::View.
Declaration of functions for checking whether a given pointer is accessible from a given Kokkos execu...
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
Declaration of a function that prints strings from each process.
Declaration of Tpetra::Details::isInterComm.
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
Declaration and definition of Tpetra::Details::normImpl, which is an implementation detail of Tpetra:...
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
static bool assumeMpiIsGPUAware()
Whether to assume that MPI is CUDA aware.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
static size_t multivectorKernelLocationThreshold()
the threshold for transitioning from device to host
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool remoteLIDsContiguous=false, const CombineMode CM=INSERT)
Reallocate imports_ if needed.
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Number of packets to receive for each receive operation.
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Buffer into which packed data are imported (received from other processes).
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
Number of packets to send for each send operation.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
A parallel distribution of indices over processes.
One or more distributed dense vectors.
void reciprocal(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,...
void normInf(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the infinity-norm of each vector (column), storing the result in a host View.
void get1dCopy(const Teuchos::ArrayView< Scalar > &A, const size_t LDA) const
Fill the given array with a copy of this multivector's local values.
size_t getStride() const
Stride between columns in the multivector.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool areRemoteLIDsContiguous=false, const CombineMode CM=INSERT) override
Reallocate imports_ if needed.
void reduce()
Sum values of a locally replicated multivector across all processes.
virtual bool checkSizes(const SrcDistObject &sourceObj) override
Whether data redistribution between sourceObj and this object is legal.
void randomize()
Set all values in the multivector to pseudorandom numbers.
typename map_type::local_ordinal_type local_ordinal_type
The type of local indices that this class uses.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subView(const Teuchos::Range1D &colRng) const
Return a const MultiVector with const views of selected columns.
void dot(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Teuchos::ArrayView< dot_type > &dots) const
Compute the dot product of each corresponding pair of vectors (columns) in A and B.
void describeImpl(Teuchos::FancyOStream &out, const std::string &className, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Implementation of describe() for this class, and its subclass Vector.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Map and their communicator.
Teuchos::ArrayRCP< Scalar > get1dViewNonConst()
Nonconst persisting (1-D) view of this multivector's local values.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
Teuchos::RCP< Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVectorNonConst(const size_t j)
Return a Vector which is a nonconst view of column j.
void meanValue(const Teuchos::ArrayView< impl_scalar_type > &means) const
Compute mean (average) value of each column.
std::string descriptionImpl(const std::string &className) const
Implementation of description() for this class, and its subclass Vector.
void multiply(Teuchos::ETransp transA, Teuchos::ETransp transB, const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Scalar &beta)
Matrix-matrix multiplication: this = beta*this + alpha*op(A)*op(B).
bool need_sync_device() const
Whether this MultiVector needs synchronization to the device.
wrapped_dual_view_type view_
The Kokkos::DualView containing the MultiVector's data.
void replaceLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using local (row) index.
void swap(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &mv)
Swap contents of mv with contents of *this.
size_t getLocalLength() const
Local number of rows on the calling process.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subCopy(const Teuchos::Range1D &colRng) const
Return a MultiVector with copies of selected columns.
typename Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner ("dot") product result.
Teuchos::ArrayRCP< const Scalar > getData(size_t j) const
Const view of the local values in a particular vector of this multivector.
bool need_sync_host() const
Whether this MultiVector needs synchronization to the host.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM, const execution_space &space) override
Same as copyAndPermute, but do operations in space.
void norm2(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the two-norm of each vector (column), storing the result in a host View.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
Teuchos::ArrayRCP< const Scalar > get1dView() const
Const persisting (1-D) view of this multivector's local values.
Teuchos::ArrayRCP< T > getSubArrayRCP(Teuchos::ArrayRCP< T > arr, size_t j) const
Persisting view of j-th column in the given ArrayRCP.
void replaceGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using global row index.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, device_type > dual_view_type
Kokkos::DualView specialization used by this class.
typename KokkosKernels::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
void elementWiseMultiply(Scalar scalarAB, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarThis)
Multiply a Vector A elementwise by a MultiVector B.
size_t getNumVectors() const
Number of columns in the multivector.
virtual size_t constantNumberOfPackets() const override
Number of packets to send per LID.
wrapped_dual_view_type getWrappedDualView() const
Return the wrapped dual view holding this MultiVector's local data.
void sumIntoLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &val, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using local row index.
void replaceMap(const Teuchos::RCP< const map_type > &map)
Replace the underlying Map in place.
Teuchos::RCP< MultiVector< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another MultiVector with the same entries, but converted to a different Scalar type T.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
typename device_type::execution_space execution_space
Type of the (new) Kokkos execution space.
void abs(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise absolute values of input Multi-vector in target: A = abs(this)
void norm1(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the one-norm of each vector (column), storing the result in a host view.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print the object with the given verbosity level to a FancyOStream.
void get2dCopy(const Teuchos::ArrayView< const Teuchos::ArrayView< Scalar > > &ArrayOfPtrs) const
Fill the given array with a copy of this multivector's local values.
void sumIntoGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using global row index.
bool aliases(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &other) const
Whether this multivector's memory might alias other. This is conservative: if either this or other is...
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that ther...
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subViewNonConst(const Teuchos::Range1D &colRng)
Return a MultiVector with views of selected columns.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
Teuchos::ArrayRCP< Scalar > getDataNonConst(size_t j)
View of the local values in a particular vector of this multivector.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
size_t getOrigNumLocalCols() const
"Original" number of columns in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< Scalar > > get2dViewNonConst()
Return non-const persisting pointers to values.
virtual std::string description() const override
A simple one-line description of this object.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetViewNonConst(const Teuchos::RCP< const map_type > &subMap, const size_t offset)
Return a nonconst view of a subset of rows.
std::string localDescribeToString(const Teuchos::EVerbosityLevel vl) const
Print the calling process' verbose describe() information to the returned string.
bool isSameSize(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &vec) const
size_t getOrigNumLocalRows() const
"Original" number of rows in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< const Scalar > > get2dView() const
Return const persisting pointers to values.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
typename KokkosKernels::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
Abstract base class for objects that can be the source of an Import or Export operation.
Implementation details of Tpetra.
int local_ordinal_type
Default value of Scalar template parameter.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
bool isInterComm(const Teuchos::Comm< int > &)
Return true if and only if the input communicator wraps an MPI intercommunicator.
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector "constructor": Create a MultiVector from a given Map.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes)
@ INSERT
Insert new values that don't currently exist.