Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_MULTIVECTOR_DEF_HPP
11#define TPETRA_MULTIVECTOR_DEF_HPP
12
20
21#include "Tpetra_Core.hpp"
22#include "Tpetra_Util.hpp"
23#include "Tpetra_Vector.hpp"
35#include "Tpetra_Details_Random.hpp"
36#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
37#include "Teuchos_SerialDenseMatrix.hpp"
38#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
39#include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
40#include "KokkosCompat_View.hpp"
41#include "KokkosBlas.hpp"
42#include "KokkosKernels_Utils.hpp"
43#include "Kokkos_Random.hpp"
44#include "KokkosKernels_ArithTraits.hpp"
45#include <memory>
46#include <sstream>
47
#ifdef HAVE_TPETRA_INST_FLOAT128
namespace Kokkos {
// Partial specialization of Kokkos::rand for __float128.
// FIXME (mfh 04 Sep 2015) Just a stub for now!
//
// Each draw() builds a quad-precision sample from two double draws: a
// high-order double plus a second double scaled so far down that it
// only perturbs bits below double precision (the scale is half the
// smallest normalized double, i.e. the low-word scale of a
// double-double representation).
template <class Generator>
struct rand<Generator, __float128> {
  static KOKKOS_INLINE_FUNCTION __float128 max() {
    return static_cast<__float128>(1.0);
  }

  static KOKKOS_INLINE_FUNCTION __float128
  draw(Generator& gen) {
    // Half the smallest normalized double, is the scaling factor of
    // the lower-order term in the double-double representation.
    const __float128 loScale =
        static_cast<__float128>(std::numeric_limits<double>::min()) /
        static_cast<__float128>(2.0);
    const __float128 hi = static_cast<__float128>(gen.drand());
    const __float128 lo = static_cast<__float128>(gen.drand()) * loScale;
    return hi + lo;
  }

  static KOKKOS_INLINE_FUNCTION __float128
  draw(Generator& gen, const __float128& range) {
    // FIXME (mfh 05 Sep 2015) Not sure if this is right.
    const __float128 loScale =
        static_cast<__float128>(std::numeric_limits<double>::min()) /
        static_cast<__float128>(2.0);
    const __float128 hi = static_cast<__float128>(gen.drand(range));
    const __float128 lo =
        static_cast<__float128>(gen.drand(range)) * loScale;
    return hi + lo;
  }

  static KOKKOS_INLINE_FUNCTION __float128
  draw(Generator& gen, const __float128& start, const __float128& end) {
    // FIXME (mfh 05 Sep 2015) Not sure if this is right.
    const __float128 loScale =
        static_cast<__float128>(std::numeric_limits<double>::min()) /
        static_cast<__float128>(2.0);
    const __float128 hi = static_cast<__float128>(gen.drand(start, end));
    const __float128 lo =
        static_cast<__float128>(gen.drand(start, end)) * loScale;
    return hi + lo;
  }
};
}  // namespace Kokkos
#endif  // HAVE_TPETRA_INST_FLOAT128
95
96namespace { // (anonymous)
97
112template <class ST, class LO, class GO, class NT>
114allocDualView(const size_t lclNumRows,
115 const size_t numCols,
116 const bool zeroOut = true,
117 const bool allowPadding = false) {
118 using Kokkos::AllowPadding;
119 using Kokkos::view_alloc;
120 using Kokkos::WithoutInitializing;
121 using ::Tpetra::Details::Behavior;
122 typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
123 typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type wrapped_dual_view_type;
124 typedef typename dual_view_type::t_dev dev_view_type;
125
126 // This needs to be a string and not a char*, if given as an
127 // argument to Kokkos::view_alloc. This is because view_alloc
128 // also allows a raw pointer as its first argument. See
129 // https://github.com/kokkos/kokkos/issues/434.
130 const std::string label("MV::DualView");
131 const bool debug = Behavior::debug();
132
133 // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
134 // creation of the DualView's Views works around
135 // Kokkos::DualView's current inability to accept an
136 // AllocationProperties initial argument (as Kokkos::View does).
137 // However, the work-around is harmless, since it does what the
138 // (currently nonexistent) equivalent DualView constructor would
139 // have done anyway.
140
141 dev_view_type d_view;
142 if (zeroOut) {
143 if (allowPadding) {
144 d_view = dev_view_type(view_alloc(label, AllowPadding),
145 lclNumRows, numCols);
146 } else {
147 d_view = dev_view_type(view_alloc(label),
148 lclNumRows, numCols);
149 }
150 } else {
151 if (allowPadding) {
152 d_view = dev_view_type(view_alloc(label,
153 WithoutInitializing,
154 AllowPadding),
155 lclNumRows, numCols);
156 } else {
157 d_view = dev_view_type(view_alloc(label, WithoutInitializing),
158 lclNumRows, numCols);
159 }
160 if (debug) {
161 // Filling with NaN is a cheap and effective way to tell if
162 // downstream code is trying to use a MultiVector's data
163 // without them having been initialized. ArithTraits lets us
164 // call nan() even if the scalar type doesn't define it; it
165 // just returns some undefined value in the latter case. This
166 // won't hurt anything because by setting zeroOut=false, users
167 // already agreed that they don't care about the contents of
168 // the MultiVector.
169 const ST nan = KokkosKernels::ArithTraits<ST>::nan();
170 KokkosBlas::fill(d_view, nan);
171 }
172 }
173 if (debug) {
174 TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(d_view.extent(0)) != lclNumRows ||
175 static_cast<size_t>(d_view.extent(1)) != numCols,
176 std::logic_error,
177 "allocDualView: d_view's dimensions actual dimensions do not match "
178 "requested dimensions. d_view is "
179 << d_view.extent(0) << " x " << d_view.extent(1) << "; requested " << lclNumRows << " x " << numCols
180 << ". Please report this bug to the Tpetra developers.");
181 }
182
183 return wrapped_dual_view_type(d_view);
184}
185
186// Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
187//
188// T: The type of the entries of the View.
189// ExecSpace: The Kokkos execution space.
190template <class T, class ExecSpace>
191struct MakeUnmanagedView {
192 // The 'false' part of the branch carefully ensures that this
193 // won't attempt to use a host execution space that hasn't been
194 // initialized. For example, if Kokkos::OpenMP is disabled and
195 // Kokkos::Threads is enabled, the latter is always the default
196 // execution space of Kokkos::HostSpace, even when ExecSpace is
197 // Kokkos::Serial. That's why we go through the trouble of asking
198 // Kokkos::DualView what _its_ space is. That seems to work
199 // around this default execution space issue.
200 //
201 typedef typename std::conditional<
202 Kokkos::SpaceAccessibility<
203 typename ExecSpace::memory_space,
204 Kokkos::HostSpace>::accessible,
205 typename ExecSpace::device_type,
206 typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
207 typedef Kokkos::LayoutLeft array_layout;
208 typedef Kokkos::View<T*, array_layout, host_exec_space,
209 Kokkos::MemoryUnmanaged>
210 view_type;
211
212 static view_type getView(const Teuchos::ArrayView<T>& x_in) {
213 const size_t numEnt = static_cast<size_t>(x_in.size());
214 if (numEnt == 0) {
215 return view_type();
216 } else {
217 return view_type(x_in.getRawPtr(), numEnt);
218 }
219 }
220};
221
222template <class WrappedDualViewType>
223WrappedDualViewType
224takeSubview(const WrappedDualViewType& X,
225 const std::pair<size_t, size_t>& rowRng,
226 const Kokkos::ALL_t& colRng)
227
228{
229 // The bug we saw below should be harder to trigger here.
230 return WrappedDualViewType(X, rowRng, colRng);
231}
232
233template <class WrappedDualViewType>
234WrappedDualViewType
235takeSubview(const WrappedDualViewType& X,
236 const Kokkos::ALL_t& rowRng,
237 const std::pair<size_t, size_t>& colRng) {
238 using DualViewType = typename WrappedDualViewType::DVT;
239 // Look carefullly at the comment in the below version of this function.
240 // The comment applies here as well.
241 if (X.extent(0) == 0 && X.extent(1) != 0) {
242 return WrappedDualViewType(DualViewType("MV::DualView", 0, colRng.second - colRng.first));
243 } else {
244 return WrappedDualViewType(X, rowRng, colRng);
245 }
246}
247
// Take a subview restricted in both rows and columns.
//
// If you take a subview of a view with zero rows, Kokkos::subview()
// always returns a DualView with the same data pointers.  This will
// break pointer equality testing between two subviews of the same 2-D
// View if it has zero row extent.  While the one (known) case where
// this was actually used has been fixed, that sort of check could very
// easily be reintroduced in the future, hence the guard below.
//
// This is not a bug in Kokkos::subview(), just some very subtle
// behavior which future developers should be wary of.
template <class WrappedDualViewType>
WrappedDualViewType
takeSubview(const WrappedDualViewType& X,
            const std::pair<size_t, size_t>& rowRng,
            const std::pair<size_t, size_t>& colRng) {
  using DualViewType = typename WrappedDualViewType::DVT;
  const bool zeroRowsNonzeroCols = (X.extent(0) == 0 && X.extent(1) != 0);
  if (!zeroRowsNonzeroCols) {
    return WrappedDualViewType(X, rowRng, colRng);
  }
  // Fresh (empty) allocation: keeps data pointers of distinct
  // subviews of a 0 x m view distinct.
  return WrappedDualViewType(DualViewType("MV::DualView", 0, colRng.second - colRng.first));
}
269
270template <class WrappedOrNotDualViewType>
271size_t
272getDualViewStride(const WrappedOrNotDualViewType& dv) {
273 // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
274 // method yet, but its Views do.
275 // NOTE: dv.stride() returns a vector of length one
276 // more than its rank
277 size_t strides[WrappedOrNotDualViewType::t_dev::rank + 1];
278 dv.stride(strides);
279 const size_t LDA = strides[1];
280 const size_t numRows = dv.extent(0);
281
282 if (LDA == 0) {
283 return (numRows == 0) ? size_t(1) : numRows;
284 } else {
285 return LDA;
286 }
287}
288
// Return the column stride (LDA) of a rank-2 View, substituting a
// BLAS-friendly nonzero value when the stride is zero.
template <class ViewType>
size_t
getViewStride(const ViewType& view) {
  const size_t lda = view.stride(1);
  if (lda != 0) {
    return lda;
  }
  // A zero column stride (e.g., a zero-column view) would upset the
  // BLAS; substitute the row count, or 1 if that is also zero.
  const size_t numRows = view.extent(0);
  return (numRows == 0) ? size_t(1) : numRows;
}
301
// Decide whether a kernel over this imports buffer should run on host
// (true) or device (false): device-current data stays on device;
// host-current data runs on host unless it exceeds a size threshold.
// NOTE(review): this listing is a doxygen extraction; the initializer
// of localLengthThreshold (original line 310) was dropped, so the code
// below is not compilable as shown -- confirm the threshold expression
// against the Tpetra repository.
302template <class impl_scalar_type, class buffer_device_type>
303bool runKernelOnHost(
304    Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports) {
305  if (!imports.need_sync_device()) {
306    return false;  // most up-to-date on device
307  } else {  // most up-to-date on host,
308    // but if large enough, worth running on device anyway
309    size_t localLengthThreshold =
311    return imports.extent(0) <= localLengthThreshold;
312  }
313}
314
// MultiVector overload of the host/device kernel-placement decision:
// same policy as the DualView overload above, keyed off the
// MultiVector's sync state and local length.
// NOTE(review): the initializer of localLengthThreshold (original line
// 322) was dropped by the doxygen extraction; the code below is not
// compilable as shown -- confirm against the Tpetra repository.
315template <class SC, class LO, class GO, class NT>
316bool runKernelOnHost(const ::Tpetra::MultiVector<SC, LO, GO, NT>& X) {
317  if (!X.need_sync_device()) {
318    return false;  // most up-to-date on device
319  } else {  // most up-to-date on host
320    // but if large enough, worth running on device anyway
321    size_t localLengthThreshold =
323    return X.getLocalLength() <= localLengthThreshold;
324  }
325}
326
// Compute the requested norm (whichNorm) of each column of a
// MultiVector into norms[], dispatching to a host or device view of
// the local data according to runKernelOnHost.
// NOTE(review): the doxygen extraction dropped original line 329 (the
// MultiVector parameter named X, used throughout the body) and line
// 333 (presumably a `using MV = ...` alias) -- confirm against the
// Tpetra repository; the remaining tokens are preserved verbatim.
327template <class SC, class LO, class GO, class NT>
328void multiVectorNormImpl(typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
330                         const ::Tpetra::Details::EWhichNorm whichNorm) {
331  using namespace Tpetra;
332  using ::Tpetra::Details::normImpl;
334  using val_type = typename MV::impl_scalar_type;
335  using mag_type = typename MV::mag_type;
336  using dual_view_type = typename MV::dual_view_type;
337
  // Gather the metadata normImpl needs: the communicator (null for a
  // map-less vector), the nonconstant-stride column list, and flags.
338  auto map = X.getMap();
339  auto comm = map.is_null() ? nullptr : map->getComm().getRawPtr();
340  auto whichVecs = getMultiVectorWhichVectors(X);
341  const bool isConstantStride = X.isConstantStride();
342  const bool isDistributed = X.isDistributed();
343
344  const bool runOnHost = runKernelOnHost(X);
345  if (runOnHost) {
346    using view_type = typename dual_view_type::t_host;
347    using array_layout = typename view_type::array_layout;
348    using device_type = typename view_type::device_type;
349
350    auto X_lcl = X.getLocalViewHost(Access::ReadOnly);
351    normImpl<val_type, array_layout, device_type,
352             mag_type>(norms, X_lcl, whichNorm, whichVecs,
353                       isConstantStride, isDistributed, comm);
354  } else {
355    using view_type = typename dual_view_type::t_dev;
356    using array_layout = typename view_type::array_layout;
357    using device_type = typename view_type::device_type;
358
359    auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
360    normImpl<val_type, array_layout, device_type,
361             mag_type>(norms, X_lcl, whichNorm, whichVecs,
362                       isConstantStride, isDistributed, comm);
363  }
364}
365} // namespace
366
367namespace Tpetra {
368
369namespace Details {
// Element-wise add-assign functor: applied at index k, it does
// tgt(k) += src(k).  Both views are captured by value.
// NOTE(review): original line 378 (the operator()'s return type /
// KOKKOS_INLINE_FUNCTION marker) was dropped by the doxygen
// extraction; the remaining tokens are preserved verbatim.
370template <typename DstView, typename SrcView>
371struct AddAssignFunctor {
372  // This functor would be better as a lambda, but CUDA cannot compile
373  // lambdas in protected functions. It compiles fine with the functor.
374  AddAssignFunctor(DstView& tgt_, SrcView& src_)
375    : tgt(tgt_)
376    , src(src_) {}

379  operator()(const size_t k) const { tgt(k) += src(k); }
380
381  DstView tgt;
382  SrcView src;
383};
384} // namespace Details
385
// Return true iff VectorIndex does not name a valid column.
// NOTE(review): original line 387 (return type and class
// qualification) was dropped by the doxygen extraction.
// NOTE(review): since VectorIndex is an unsigned size_t, the clause
// `(VectorIndex < 1 && VectorIndex != 0)` can never be true, so only
// the upper-bound check `>= getNumVectors()` has effect -- this looks
// like a remnant of a signed-index check; confirm before simplifying.
386template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
388    vectorIndexOutOfRange(const size_t VectorIndex) const {
389  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
390}
391
// NOTE(review): original lines 393-395 were dropped by the doxygen
// extraction; only this template header and a blank line remain.
// Presumably this was the MultiVector destructor (or another
// defaulted special member) -- confirm against the Tpetra repository.
392template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
396
// Constructor: allocate a numVecs-column MultiVector over the given
// Map, optionally zero-initialized (zeroOut defaults to true).
// NOTE(review): the doxygen extraction dropped original line 398 (the
// class qualification) and line 407 (presumably the view_ allocation
// via allocDualView, given lclNumRows is computed and then unused
// here) -- confirm against the Tpetra repository.
397template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
399    MultiVector(const Teuchos::RCP<const map_type>& map,
400                const size_t numVecs,
401                const bool zeroOut)
402  : /* default is true */
403  base_type(map) {
404  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,numVecs,zeroOut)");
405
406  const size_t lclNumRows = this->getLocalLength();
408}
409
// Copy-or-view "copy constructor": with Teuchos::View the new object
// aliases the source's data (member initializers copy view_ and
// whichVectors_); with Teuchos::Copy it replaces them with a deep
// copy; any other enum value throws std::invalid_argument.
// NOTE(review): the doxygen extraction dropped original lines 411-412
// and 414 (class qualification, source parameter, base-class
// initializer), 417, and 432 (the exception-macro head) -- the
// remaining tokens are preserved verbatim.
410template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
413                const Teuchos::DataAccess copyOrView)
415  , view_(source.view_)
416  , whichVectors_(source.whichVectors_) {
418  const char tfecfFuncName[] =
419      "MultiVector(const MultiVector&, "
420      "const Teuchos::DataAccess): ";
421
422  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV 2-arg \"copy\" ctor");
423
424  if (copyOrView == Teuchos::Copy) {
425    // Reuse the conveniently already existing function that creates
426    // a deep copy.
427    MV cpy = createCopy(source);
428    this->view_ = cpy.view_;
429    this->whichVectors_ = cpy.whichVectors_;
430  } else if (copyOrView == Teuchos::View) {
431  } else {
433        true, std::invalid_argument,
434        "Second argument 'copyOrView' has an "
435        "invalid value "
436            << copyOrView << ". Valid values include "
437                             "Teuchos::Copy = "
438            << Teuchos::Copy << " and Teuchos::View = "
439            << Teuchos::View << ".");
440  }
441}
442
// Constructor: wrap an existing Kokkos::DualView (no copy), checking
// that its row extent and stride are consistent with the Map, and (in
// debug mode) that the DualView is globally valid across the Map's
// communicator.
// NOTE(review): the doxygen extraction dropped original lines 444
// (class qualification), 447 (base-class initializer), 454 and 458
// (exception-macro head and first stream operand) -- the remaining
// tokens are preserved verbatim.
443template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
445    MultiVector(const Teuchos::RCP<const map_type>& map,
446                const dual_view_type& view)
448  , view_(wrapped_dual_view_type(view)) {
449  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
450  const size_t lclNumRows_map = map.is_null() ? size_t(0) : map->getLocalNumElements();
451  const size_t lclNumRows_view = view.extent(0);
452  const size_t LDA = getDualViewStride(view_);
453
455      std::invalid_argument,
456      "Kokkos::DualView does not match Map. "
457      "map->getLocalNumElements() = "
459          << ", view.extent(0) = " << lclNumRows_view
460          << ", and getStride() = " << LDA << ".");
462  using ::Tpetra::Details::Behavior;
463  const bool debug = Behavior::debug();
464  if (debug) {
465    using ::Tpetra::Details::checkGlobalDualViewValidity;
466    std::ostringstream gblErrStrm;
467    const bool verbose = Behavior::verbose();
468    const auto comm = map.is_null() ? Teuchos::null : map->getComm();
469    const bool gblValid =
470        checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
471                                    comm.getRawPtr());
472    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
473  }
474}
475
// Constructor: wrap an existing wrapped_dual_view_type (no copy),
// with the same Map-consistency and (debug-mode) global-validity
// checks as the DualView overload above.
// NOTE(review): the doxygen extraction dropped original lines 477
// (class qualification), 479 (the view parameter), 487 and 491
// (exception-macro head and first stream operand) -- the remaining
// tokens are preserved verbatim.
476template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
478    MultiVector(const Teuchos::RCP<const map_type>& map,
480  : base_type(map)
481  , view_(view) {
482  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
483  const size_t lclNumRows_map = map.is_null() ? size_t(0) : map->getLocalNumElements();
484  const size_t lclNumRows_view = view.extent(0);
485  const size_t LDA = getDualViewStride(view);
486
488      std::invalid_argument,
489      "Kokkos::DualView does not match Map. "
490      "map->getLocalNumElements() = "
492          << ", view.extent(0) = " << lclNumRows_view
493          << ", and getStride() = " << LDA << ".");
494
495  using ::Tpetra::Details::Behavior;
496  const bool debug = Behavior::debug();
497  if (debug) {
498    using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
499    std::ostringstream gblErrStrm;
500    const bool verbose = Behavior::verbose();
501    const auto comm = map.is_null() ? Teuchos::null : map->getComm();
502    const bool gblValid =
503        checkGlobalWrappedDualViewValidity(&gblErrStrm, view, verbose,
504                                           comm.getRawPtr());
505    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
506  }
507}
508
// Constructor: adopt a user-supplied device View, pairing it with a
// host mirror; marks the result modified-on-device so the user's
// initial device contents win at the next sync.
// NOTE(review): the doxygen extraction dropped original lines 510
// (class qualification), 513 (base-class initializer / ctor-body
// open), 522 (exception-macro head), and 530-531 -- the latter
// presumably declare the `dual_view` assembled from d_view and h_view
// that is assigned to view_ and touched at line 548; confirm against
// the Tpetra repository.  Remaining tokens are preserved verbatim.
509template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
511    MultiVector(const Teuchos::RCP<const map_type>& map,
512                const typename dual_view_type::t_dev& d_view)
514  using Teuchos::ArrayRCP;
515  using Teuchos::RCP;
516  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
517
518  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,d_view)");
519
520  const size_t LDA = getViewStride(d_view);
521  const size_t lclNumRows = map->getLocalNumElements();
523                                        "Map does not match "
524                                        "Kokkos::View. map->getLocalNumElements() = "
525                                            << lclNumRows
526                                            << ", View's column stride = " << LDA
527                                            << ", and View's extent(0) = " << d_view.extent(0) << ".");
528
529  auto h_view = Kokkos::create_mirror_view(d_view);
532
533  using ::Tpetra::Details::Behavior;
534  const bool debug = Behavior::debug();
535  if (debug) {
536    using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
537    std::ostringstream gblErrStrm;
538    const bool verbose = Behavior::verbose();
539    const auto comm = map.is_null() ? Teuchos::null : map->getComm();
540    const bool gblValid =
541        checkGlobalWrappedDualViewValidity(&gblErrStrm, view_, verbose,
542                                           comm.getRawPtr());
543    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
544  }
545  // The user gave us a device view. In order to respect its
546  // initial contents, we mark the DualView as "modified on device."
547  // That way, the next sync will synchronize it with the host view.
548  dual_view.modify_device();
549}
550
// Constructor: wrap a DualView together with the "original" DualView
// it was sliced from (origView), so offsetView can recover the full
// allocation; validates origView's stride against the Map and (debug
// mode) both views' global validity.
// NOTE(review): the doxygen extraction dropped original lines 552
// (class qualification), 555 (the origView parameter), and 562 (the
// exception-macro head) -- remaining tokens are preserved verbatim,
// including the "does not not match" typo inside the runtime error
// string (a doc_update may not alter string literals).
551template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
553    MultiVector(const Teuchos::RCP<const map_type>& map,
554                const dual_view_type& view,
556  : base_type(map)
557  , view_(wrapped_dual_view_type(view, origView)) {
558  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
559
560  const size_t LDA = getDualViewStride(origView);
561  const size_t lclNumRows = this->getLocalLength();  // comes from the Map
563      LDA < lclNumRows, std::invalid_argument,
564      "The input Kokkos::DualView's "
565      "column stride LDA = "
566          << LDA << " < getLocalLength() = " << lclNumRows
567          << ". This may also mean that the input origView's first dimension (number "
568             "of rows = "
569          << origView.extent(0) << ") does not not match the number "
                                    "of entries in the Map on the calling process.");
571
572  using ::Tpetra::Details::Behavior;
573  const bool debug = Behavior::debug();
574  if (debug) {
575    using ::Tpetra::Details::checkGlobalDualViewValidity;
576    std::ostringstream gblErrStrm;
577    const bool verbose = Behavior::verbose();
578    const auto comm = map.is_null() ? Teuchos::null : map->getComm();
579    const bool gblValid_0 =
580        checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
581                                    comm.getRawPtr());
582    const bool gblValid_1 =
583        checkGlobalDualViewValidity(&gblErrStrm, origView, verbose,
584                                    comm.getRawPtr());
585    const bool gblValid = gblValid_0 && gblValid_1;
586    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
587  }
588}
589
// Constructor: view of selected columns (whichVectors) of an existing
// DualView.  Validates whichVectors entries and view dimensions; a
// single-entry whichVectors collapses to a constant-stride,
// one-column subview (whichVectors_ cleared).
// NOTE(review): the doxygen extraction dropped original lines 591
// (class qualification), 622, 626, 635, 641, 649 (exception-macro
// heads), and 664 (presumably the takeSubview assignment to view_, as
// in the wrapped-view overload of this constructor) -- remaining
// tokens are preserved verbatim.
590template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
592    MultiVector(const Teuchos::RCP<const map_type>& map,
593                const dual_view_type& view,
594                const Teuchos::ArrayView<const size_t>& whichVectors)
595  : base_type(map)
596  , view_(view)
597  , whichVectors_(whichVectors.begin(), whichVectors.end()) {
598  using Kokkos::ALL;
599  using Kokkos::subview;
600  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
601
602  using ::Tpetra::Details::Behavior;
603  const bool debug = Behavior::debug();
604  if (debug) {
605    using ::Tpetra::Details::checkGlobalDualViewValidity;
606    std::ostringstream gblErrStrm;
607    const bool verbose = Behavior::verbose();
608    const auto comm = map.is_null() ? Teuchos::null : map->getComm();
609    const bool gblValid =
610        checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
611                                    comm.getRawPtr());
612    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
613  }
614
615  const size_t lclNumRows = map.is_null() ? size_t(0) : map->getLocalNumElements();
616  // Check dimensions of the input DualView. We accept that Kokkos
617  // might not allow construction of a 0 x m (Dual)View with m > 0,
618  // so we only require the number of rows to match if the
619  // (Dual)View has more than zero columns. Likewise, we only
620  // require the number of columns to match if the (Dual)View has
621  // more than zero rows.
623      view.extent(1) != 0 && static_cast<size_t>(view.extent(0)) < lclNumRows,
624      std::invalid_argument, "view.extent(0) = " << view.extent(0) << " < map->getLocalNumElements() = " << lclNumRows << ".");
625  if (whichVectors.size() != 0) {
627        view.extent(1) != 0 && view.extent(1) == 0,
628        std::invalid_argument,
629        "view.extent(1) = 0, but whichVectors.size()"
630        " = "
631            << whichVectors.size() << " > 0.");
632    size_t maxColInd = 0;
633    typedef Teuchos::ArrayView<const size_t>::size_type size_type;
634    for (size_type k = 0; k < whichVectors.size(); ++k) {
636          whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid(),
637          std::invalid_argument, "whichVectors[" << k << "] = "
638                                                    "Teuchos::OrdinalTraits<size_t>::invalid().");
639      maxColInd = std::max(maxColInd, whichVectors[k]);
640    }
642        view.extent(1) != 0 && static_cast<size_t>(view.extent(1)) <= maxColInd,
643        std::invalid_argument, "view.extent(1) = " << view.extent(1) << " <= max(whichVectors) = " << maxColInd << ".");
644  }
645
646  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
647  // zero strides, so modify in that case.
648  const size_t LDA = getDualViewStride(view);
650                                        "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
651
652  if (whichVectors.size() == 1) {
653    // If whichVectors has only one entry, we don't need to bother
654    // with nonconstant stride. Just shift the view over so it
655    // points to the desired column.
656    //
657    // NOTE (mfh 10 May 2014) This is a special case where we set
658    // origView_ just to view that one column, not all of the
659    // original columns. This ensures that the use of origView_ in
660    // offsetView works correctly.
661    //
662    const std::pair<size_t, size_t> colRng(whichVectors[0],
663                                           whichVectors[0] + 1);
665    // whichVectors_.size() == 0 means "constant stride."
666    whichVectors_.clear();
667  }
668}
669
// Constructor: view of selected columns (whichVectors) of an existing
// wrapped_dual_view_type; same validation and single-column
// constant-stride collapse as the DualView overload above.
// NOTE(review): the doxygen extraction dropped original lines 671
// (class qualification) and 702, 706, 715, 721, 729 (exception-macro
// heads) -- remaining tokens are preserved verbatim.
670template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
672    MultiVector(const Teuchos::RCP<const map_type>& map,
673                const wrapped_dual_view_type& view,
674                const Teuchos::ArrayView<const size_t>& whichVectors)
675  : base_type(map)
676  , view_(view)
677  , whichVectors_(whichVectors.begin(), whichVectors.end()) {
678  using Kokkos::ALL;
679  using Kokkos::subview;
680  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
681
682  using ::Tpetra::Details::Behavior;
683  const bool debug = Behavior::debug();
684  if (debug) {
685    using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
686    std::ostringstream gblErrStrm;
687    const bool verbose = Behavior::verbose();
688    const auto comm = map.is_null() ? Teuchos::null : map->getComm();
689    const bool gblValid =
690        checkGlobalWrappedDualViewValidity(&gblErrStrm, view, verbose,
691                                           comm.getRawPtr());
692    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
693  }
694
695  const size_t lclNumRows = map.is_null() ? size_t(0) : map->getLocalNumElements();
696  // Check dimensions of the input DualView. We accept that Kokkos
697  // might not allow construction of a 0 x m (Dual)View with m > 0,
698  // so we only require the number of rows to match if the
699  // (Dual)View has more than zero columns. Likewise, we only
700  // require the number of columns to match if the (Dual)View has
701  // more than zero rows.
703      view.extent(1) != 0 && static_cast<size_t>(view.extent(0)) < lclNumRows,
704      std::invalid_argument, "view.extent(0) = " << view.extent(0) << " < map->getLocalNumElements() = " << lclNumRows << ".");
705  if (whichVectors.size() != 0) {
707        view.extent(1) != 0 && view.extent(1) == 0,
708        std::invalid_argument,
709        "view.extent(1) = 0, but whichVectors.size()"
710        " = "
711            << whichVectors.size() << " > 0.");
712    size_t maxColInd = 0;
713    typedef Teuchos::ArrayView<const size_t>::size_type size_type;
714    for (size_type k = 0; k < whichVectors.size(); ++k) {
716          whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid(),
717          std::invalid_argument, "whichVectors[" << k << "] = "
718                                                    "Teuchos::OrdinalTraits<size_t>::invalid().");
719      maxColInd = std::max(maxColInd, whichVectors[k]);
720    }
722        view.extent(1) != 0 && static_cast<size_t>(view.extent(1)) <= maxColInd,
723        std::invalid_argument, "view.extent(1) = " << view.extent(1) << " <= max(whichVectors) = " << maxColInd << ".");
724  }
725
726  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
727  // zero strides, so modify in that case.
728  const size_t LDA = getDualViewStride(view);
730                                        "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
731
732  if (whichVectors.size() == 1) {
733    // If whichVectors has only one entry, we don't need to bother
734    // with nonconstant stride. Just shift the view over so it
735    // points to the desired column.
736    //
737    // NOTE (mfh 10 May 2014) This is a special case where we set
738    // origView_ just to view that one column, not all of the
739    // original columns. This ensures that the use of origView_ in
740    // offsetView works correctly.
741    //
742    const std::pair<size_t, size_t> colRng(whichVectors[0],
743                                           whichVectors[0] + 1);
744    view_ = takeSubview(view_, ALL(), colRng);
745    // whichVectors_.size() == 0 means "constant stride."
746    whichVectors_.clear();
747  }
748}
749
// Constructor: selected columns of a DualView plus the original
// DualView it was sliced from (origView), combining the
// origView-stride validation of the (map,view,origView) constructor
// with the whichVectors handling of the column-selection constructors.
// NOTE(review): the doxygen extraction dropped original lines 751
// (class qualification), 754 (the origView parameter), and 787, 791,
// 800, 806, 814 (exception-macro heads), plus 831 (presumably the
// takeSubview assignment to view_) -- remaining tokens are preserved
// verbatim.
750template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
752    MultiVector(const Teuchos::RCP<const map_type>& map,
753                const dual_view_type& view,
755                const Teuchos::ArrayView<const size_t>& whichVectors)
756  : base_type(map)
757  , view_(wrapped_dual_view_type(view, origView))
758  , whichVectors_(whichVectors.begin(), whichVectors.end()) {
759  using Kokkos::ALL;
760  using Kokkos::subview;
761  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
762
763  using ::Tpetra::Details::Behavior;
764  const bool debug = Behavior::debug();
765  if (debug) {
766    using ::Tpetra::Details::checkGlobalDualViewValidity;
767    std::ostringstream gblErrStrm;
768    const bool verbose = Behavior::verbose();
769    const auto comm = map.is_null() ? Teuchos::null : map->getComm();
770    const bool gblValid_0 =
771        checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
772                                    comm.getRawPtr());
773    const bool gblValid_1 =
774        checkGlobalDualViewValidity(&gblErrStrm, origView, verbose,
775                                    comm.getRawPtr());
776    const bool gblValid = gblValid_0 && gblValid_1;
777    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
778  }
779
780  const size_t lclNumRows = this->getLocalLength();
781  // Check dimensions of the input DualView. We accept that Kokkos
782  // might not allow construction of a 0 x m (Dual)View with m > 0,
783  // so we only require the number of rows to match if the
784  // (Dual)View has more than zero columns. Likewise, we only
785  // require the number of columns to match if the (Dual)View has
786  // more than zero rows.
788      view.extent(1) != 0 && static_cast<size_t>(view.extent(0)) < lclNumRows,
789      std::invalid_argument, "view.extent(0) = " << view.extent(0) << " < map->getLocalNumElements() = " << lclNumRows << ".");
790  if (whichVectors.size() != 0) {
792        view.extent(1) != 0 && view.extent(1) == 0,
793        std::invalid_argument,
794        "view.extent(1) = 0, but whichVectors.size()"
795        " = "
796            << whichVectors.size() << " > 0.");
797    size_t maxColInd = 0;
798    typedef Teuchos::ArrayView<const size_t>::size_type size_type;
799    for (size_type k = 0; k < whichVectors.size(); ++k) {
801          whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid(),
802          std::invalid_argument, "whichVectors[" << k << "] = "
803                                                    "Teuchos::OrdinalTraits<size_t>::invalid().");
804      maxColInd = std::max(maxColInd, whichVectors[k]);
805    }
807        view.extent(1) != 0 && static_cast<size_t>(view.extent(1)) <= maxColInd,
808        std::invalid_argument, "view.extent(1) = " << view.extent(1) << " <= max(whichVectors) = " << maxColInd << ".");
809  }
810
811  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
812  // zero strides, so modify in that case.
813  const size_t LDA = getDualViewStride(origView);
815                                        "Map and DualView origView "
816                                        "do not match. LDA = "
817                                            << LDA << " < this->getLocalLength() = " << lclNumRows << ". origView.extent(0) = " << origView.extent(0)
818                                            << ", origView.stride(1) = " << origView.view_device().stride(1) << ".");
819
820  if (whichVectors.size() == 1) {
821    // If whichVectors has only one entry, we don't need to bother
822    // with nonconstant stride. Just shift the view over so it
823    // points to the desired column.
824    //
825    // NOTE (mfh 10 May 2014) This is a special case where we set
826    // origView_ just to view that one column, not all of the
827    // original columns. This ensures that the use of origView_ in
828    // offsetView works correctly.
829    const std::pair<size_t, size_t> colRng(whichVectors[0],
830                                           whichVectors[0] + 1);
832    // origView_ = takeSubview (origView_, ALL (), colRng);
833    // whichVectors_.size() == 0 means "constant stride."
834    whichVectors_.clear();
835  }
836}
// Deep-copy constructor from a host Teuchos::ArrayView laid out with
// column stride LDA: validates LDA and data.size() against the Map
// and numVecs, wraps the raw data in an unmanaged host View, and
// deep-copies it into this vector's device view (whole-view copy if
// strides match, else column by column).
// NOTE(review): the doxygen extraction dropped original lines 839
// (class qualification), 870, 883 (the X_in_orig declaration tail),
// and 899, 901 (the out/in column-view typedef names) -- remaining
// tokens are preserved verbatim.
838template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
840    MultiVector(const Teuchos::RCP<const map_type>& map,
841                const Teuchos::ArrayView<const Scalar>& data,
842                const size_t LDA,
843                const size_t numVecs)
844  : base_type(map) {
845  typedef LocalOrdinal LO;
846  typedef GlobalOrdinal GO;
847  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
848  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
849
850  // Deep copy constructor, constant stride (NO whichVectors_).
851  // There is no need for a deep copy constructor with nonconstant stride.
852
853  const size_t lclNumRows =
854      map.is_null() ? size_t(0) : map->getLocalNumElements();
855  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
856                                                                                          "map->getLocalNumElements() = "
857                                                                                       << lclNumRows << ".");
858  if (numVecs != 0) {
859    const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
860    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(data.size()) < minNumEntries,
861                                          std::invalid_argument,
862                                          "Input Teuchos::ArrayView does not have enough "
863                                          "entries, given the input Map and number of vectors in the MultiVector."
864                                          " data.size() = "
865                                              << data.size() << " < (LDA*(numVecs-1)) + "
866                                                                "map->getLocalNumElements () = "
867                                              << minNumEntries << ".");
868  }
869
871  // Note: X_out will be completely overwritten
872  auto X_out = this->getLocalViewDevice(Access::OverwriteAll);
873
874  // Make an unmanaged host Kokkos::View of the input data. First
875  // create a View (X_in_orig) with the original stride. Then,
876  // create a subview (X_in) with the right number of columns.
877  const impl_scalar_type* const X_in_raw =
878      reinterpret_cast<const impl_scalar_type*>(data.getRawPtr());
879  Kokkos::View<const impl_scalar_type**,
880               Kokkos::LayoutLeft,
881               Kokkos::HostSpace,
882               Kokkos::MemoryUnmanaged>
884  const Kokkos::pair<size_t, size_t> rowRng(0, lclNumRows);
885  auto X_in = Kokkos::subview(X_in_orig, rowRng, Kokkos::ALL());
886
887  // If LDA != X_out's column stride, then we need to copy one
888  // column at a time; Kokkos::deep_copy refuses to work in that
889  // case.
890  const size_t outStride =
891      X_out.extent(1) == 0 ? size_t(1) : X_out.stride(1);
892  if (LDA == outStride) {  // strides are the same; deep_copy once
893    // This only works because MultiVector uses LayoutLeft.
894    // We would need a custom copy functor otherwise.
895    // DEEP_COPY REVIEW - HOST-TO-DEVICE
896    Kokkos::deep_copy(execution_space(), X_out, X_in);
897  } else {  // strides differ; copy one column at a time
898    typedef decltype(Kokkos::subview(X_out, Kokkos::ALL(), 0))
900    typedef decltype(Kokkos::subview(X_in, Kokkos::ALL(), 0))
902    for (size_t j = 0; j < numVecs; ++j) {
903      out_col_view_type X_out_j = Kokkos::subview(X_out, Kokkos::ALL(), j);
904      in_col_view_type X_in_j = Kokkos::subview(X_in, Kokkos::ALL(), j);
905      // DEEP_COPY REVIEW - HOST-TO-DEVICE
906      Kokkos::deep_copy(execution_space(), X_out_j, X_in_j);
907    }
908  }
909}
910
// Deep-copy constructor from an array of per-column host arrays.
//
// ArrayOfPtrs[j] supplies the data for column j; each entry must hold at
// least map->getLocalNumElements() values. Copies each column
// host-to-device one at a time. Throws std::runtime_error if numVecs < 1
// or numVecs != ArrayOfPtrs.size(), and std::invalid_argument if any
// column array is too short.
911template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
913 MultiVector(const Teuchos::RCP<const map_type>& map,
914 const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
915 const size_t numVecs)
916 : base_type(map) {
917 typedef impl_scalar_type IST;
918 typedef LocalOrdinal LO;
919 typedef GlobalOrdinal GO;
920 const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
921 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
922
// A null Map is possible (e.g., after replaceMap()); treat as zero rows.
923 const size_t lclNumRows =
924 map.is_null() ? size_t(0) : map->getLocalNumElements();
925 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numVecs < 1 || numVecs != static_cast<size_t>(ArrayOfPtrs.size()),
926 std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
927 "ArrayOfPtrs.size() (= "
928 << ArrayOfPtrs.size() << ") != numVecs.");
// Validate every input column's length before copying anything.
929 for (size_t j = 0; j < numVecs; ++j) {
930 Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
932 static_cast<size_t>(X_j_av.size()) < lclNumRows,
933 std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = " << X_j_av.size() << " < map->getLocalNumElements() = " << lclNumRows << ".");
934 }
935
937 auto X_out = this->getLocalViewDevice(Access::ReadWrite);
938
939 // Make sure that the type of a single input column has the same
940 // array layout as each output column does, so we can deep_copy.
941 using array_layout = typename decltype(X_out)::array_layout;
942 using input_col_view_type = typename Kokkos::View<const IST*,
943 array_layout,
944 Kokkos::HostSpace,
945 Kokkos::MemoryUnmanaged>;
946
// Copy each input column into the corresponding device column.
947 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
948 for (size_t j = 0; j < numVecs; ++j) {
949 Teuchos::ArrayView<const IST> X_j_av =
950 Teuchos::av_reinterpret_cast<const IST>(ArrayOfPtrs[j]);
952 auto X_j_out = Kokkos::subview(X_out, rowRng, j);
953 // DEEP_COPY REVIEW - HOST-TO-DEVICE
954 Kokkos::deep_copy(execution_space(), X_j_out, X_j_in);
955 }
956}
957
// Whether the columns of this MultiVector are stored with constant
// stride. An empty whichVectors_ encodes "constant stride"; a nonempty
// whichVectors_ lists the selected (possibly noncontiguous) columns.
958template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
960 isConstantStride() const {
961 return whichVectors_.empty();
962}
963
// Number of rows owned by the calling process, i.e., the local number of
// elements of the Map. Returns 0 if the Map is null (which can happen
// after replaceMap()).
964template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
965size_t
967 getLocalLength() const {
968 if (this->getMap().is_null()) { // possible, due to replaceMap().
969 return static_cast<size_t>(0);
970 } else {
971 return this->getMap()->getLocalNumElements();
972 }
973}
974
// Global number of rows, i.e., the global number of elements of the Map.
// Returns 0 if the Map is null (possible after replaceMap()).
975template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
979 if (this->getMap().is_null()) { // possible, due to replaceMap().
980 return static_cast<size_t>(0);
981 } else {
982 return this->getMap()->getGlobalNumElements();
983 }
984}
985
// Column stride of the underlying (dual) view, if this MultiVector has
// constant stride; 0 otherwise (nonconstant stride has no single stride).
986template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
987size_t
989 getStride() const {
990 return isConstantStride() ? getDualViewStride(view_) : size_t(0);
991}
992
// Compares the raw host data pointers and spans of this MultiVector's
// allocation against `other`'s. NOTE(review): presumably this implements
// an aliasing/overlap test between the two allocations — confirm against
// the declaration in the _decl header, which is not visible here.
993template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
996 // Don't actually get a view, just get pointers.
997 auto thisData = view_.getDualView().view_host().data();
998 auto otherData = other.view_.getDualView().view_host().data();
999 size_t thisSpan = view_.getDualView().view_host().span();
1000 size_t otherSpan = other.view_.getDualView().view_host().span();
1002}
1003
// DistObject hook: decide whether `sourceObj` is a legal source for an
// Import/Export into this MultiVector. Returns false if the source is
// not a MultiVector at all; otherwise requires only a matching number of
// columns (the Maps were already checked when the Import/Export object
// was built).
1004template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1007 // Check whether the source object is a MultiVector. If not, then
1008 // we can't even compare sizes, so it's definitely not OK to
1009 // Import or Export from it.
1011 const MV* src = dynamic_cast<const MV*>(&sourceObj);
1012 if (src == nullptr) {
1013 return false;
1014 } else {
1015 // The target of the Import or Export calls checkSizes() in
1016 // DistObject::doTransfer(). By that point, we've already
1017 // constructed an Import or Export object using the two
1018 // multivectors' Maps, which means that (hopefully) we've
1019 // already checked other attributes of the multivectors. Thus,
1020 // all we need to do here is check the number of columns.
1021 return src->getNumVectors() == this->getNumVectors();
1022 }
1023}
1024
// Returns the number of vectors (columns). NOTE(review): the signature
// line is not visible here; based on the body and its use in
// copyAndPermute below, this appears to be constantNumberOfPackets(),
// i.e., each LID carries one packet per column — confirm in the header.
1025template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1026size_t
1029 return this->getNumVectors();
1030}
1031
// DistObject hook: copy the first numSameIDs rows directly from the
// source MultiVector, then permute rows permuteFromLIDs[k] (source) into
// rows permuteToLIDs[k] (target). CM == ADD_ASSIGN sums into the target;
// any other CombineMode overwrites. Work runs on host or device
// depending on where the source data is current and on whether imports_
// is aliased to the target (see copyOnHost logic below). `space` is the
// execution-space instance used for kernels and deep_copies.
1032template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1035 const size_t numSameIDs,
1036 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1037 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1038 const CombineMode CM,
1039 const execution_space& space) {
1040 using Kokkos::Compat::create_const_view;
1041 using KokkosRefactor::Details::permute_array_multi_column;
1042 using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
1043 using std::endl;
1044 using ::Tpetra::Details::Behavior;
1045 using ::Tpetra::Details::getDualViewCopyFromArrayView;
1046 using ::Tpetra::Details::ProfilingRegion;
1049 // We've already called checkSizes(), so this cast must succeed.
1050 MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&>(sourceObj));
1051
// Decide whether to copy/permute on host or device.
1052 bool copyOnHost;
1053 if (importsAreAliased() && (this->constantNumberOfPackets() != 0) && Behavior::enableGranularTransfers()) {
1054 // imports are aliased to the target. We already posted Irecvs
1055 // into imports using a memory space that depends on GPU
1056 // awareness. Therefore we want to modify target in the same
1057 // memory space. See comments in DistObject::beginTransfer.
1058 copyOnHost = !Behavior::assumeMpiIsGPUAware();
1059 } else {
1060 // We are free to choose the better memory space for copyAndPermute.
1062 }
1063 const char longFuncNameHost[] = "Tpetra::MultiVector::copyAndPermute[Host]";
1064 const char longFuncNameDevice[] = "Tpetra::MultiVector::copyAndPermute[Device]";
1065 const char tfecfFuncName[] = "copyAndPermute: ";
1067
// Optional per-process debug output, controlled by TPETRA_VERBOSE.
1068 const bool verbose = Behavior::verbose();
1069 std::unique_ptr<std::string> prefix;
1070 if (verbose) {
1071 auto map = this->getMap();
1072 auto comm = map.is_null() ? Teuchos::null : map->getComm();
1073 const int myRank = comm.is_null() ? -1 : comm->getRank();
1074 std::ostringstream os;
1075 os << "Proc " << myRank << ": MV::copyAndPermute: ";
1076 prefix = std::unique_ptr<std::string>(new std::string(os.str()));
1077 os << "Start" << endl;
1078 std::cerr << os.str();
1079 }
1080
1082 std::logic_error, "permuteToLIDs.extent(0) = " << permuteToLIDs.extent(0) << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0) << ".");
1083 const size_t numCols = this->getNumVectors();
1084
1085 // sourceMV doesn't belong to us, so we can't sync it. Do the
1086 // copying where it's currently sync'd.
1087 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(sourceMV.need_sync_device() && sourceMV.need_sync_host(),
1088 std::logic_error,
1089 "Input MultiVector needs sync to both host "
1090 "and device.");
1091 if (verbose) {
1092 std::ostringstream os;
1093 os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1094 std::cerr << os.str();
1095 }
1096
1097 if (verbose) {
1098 std::ostringstream os;
1099 os << *prefix << "Copy first " << numSameIDs << " elements" << endl;
1100 std::cerr << os.str();
1101 }
1102
1103 // TODO (mfh 15 Sep 2013) When we replace
1104 // KokkosClassic::MultiVector with a Kokkos::View, there are two
1105 // ways to copy the data:
1106 //
1107 // 1. Get a (sub)view of each column and call deep_copy on that.
1108 // 2. Write a custom kernel to copy the data.
1109 //
1110 // The first is easier, but the second might be more performant in
1111 // case we decide to use layouts other than LayoutLeft. It might
1112 // even make sense to hide whichVectors_ in an entirely new layout
1113 // for Kokkos Views.
1114
1115 // Copy rows [0, numSameIDs-1] of the local multivectors.
1116 //
1117 // Note (ETP 2 Jul 2014) We need to always copy one column at a
1118 // time, even when both multivectors are constant-stride, since
1119 // deep_copy between strided subviews with more than one column
1120 // doesn't currently work.
1121
1122 // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1123 // both source and target are constant stride and have multiple
1124 // columns.
1125 if (numSameIDs > 0) {
1126 const std::pair<size_t, size_t> rows(0, numSameIDs);
1127 if (copyOnHost) {
1128 ProfilingRegion regionC("Tpetra::MultiVector::copy[Host]");
1129 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1130 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1131
// Per-column copy; whichVectors_ maps logical to physical columns
// when either MV has nonconstant stride.
1132 for (size_t j = 0; j < numCols; ++j) {
1133 const size_t tgtCol = isConstantStride() ? j : whichVectors_[j];
1134 const size_t srcCol =
1135 sourceMV.isConstantStride() ? j : sourceMV.whichVectors_[j];
1136
1137 auto tgt_j = Kokkos::subview(tgt_h, rows, tgtCol);
1138 auto src_j = Kokkos::subview(src_h, rows, srcCol);
1139 if (CM == ADD_ASSIGN) {
1140 // Sum src_j into tgt_j
1141 using range_t =
1142 Kokkos::RangePolicy<execution_space, size_t>;
1143 range_t rp(space, 0, numSameIDs);
1144 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1145 aaf(tgt_j, src_j);
1146 Kokkos::parallel_for(rp, aaf);
1147 } else {
1148 // Copy src_j into tgt_j
1149 // DEEP_COPY REVIEW - HOSTMIRROR-TO-HOSTMIRROR
1150 Kokkos::deep_copy(space, tgt_j, src_j);
1151 space.fence("Tpetra::MultiVector::copyAndPermute-1");
1152 }
1153 }
1154 } else { // copy on device
1155 ProfilingRegion regionC("Tpetra::MultiVector::copy[Device]");
1156 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1157 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1158
1159 for (size_t j = 0; j < numCols; ++j) {
1160 const size_t tgtCol = isConstantStride() ? j : whichVectors_[j];
1161 const size_t srcCol =
1162 sourceMV.isConstantStride() ? j : sourceMV.whichVectors_[j];
1163
1164 auto tgt_j = Kokkos::subview(tgt_d, rows, tgtCol);
1165 auto src_j = Kokkos::subview(src_d, rows, srcCol);
1166 if (CM == ADD_ASSIGN) {
1167 // Sum src_j into tgt_j
1168 using range_t =
1169 Kokkos::RangePolicy<execution_space, size_t>;
1170 range_t rp(space, 0, numSameIDs);
1171 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1172 aaf(tgt_j, src_j);
1173 Kokkos::parallel_for(rp, aaf);
1174 } else {
1175 // Copy src_j into tgt_j
1176 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
1177 Kokkos::deep_copy(space, tgt_j, src_j);
1178 space.fence("Tpetra::MultiVector::copyAndPermute-2");
1179 }
1180 }
1181 }
1182 }
1183
1184 // For the remaining GIDs, execute the permutations. This may
1185 // involve noncontiguous access of both source and destination
1186 // vectors, depending on the LID lists.
1187 //
1188 // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1189 // the same process, this merges their values by replacement of
1190 // the last encountered GID, not by the specified merge rule
1191 // (such as ADD).
1192
1193 // If there are no permutations, we are done
1194 if (permuteFromLIDs.extent(0) == 0 ||
1195 permuteToLIDs.extent(0) == 0) {
1196 if (verbose) {
1197 std::ostringstream os;
1198 os << *prefix << "No permutations. Done!" << endl;
1199 std::cerr << os.str();
1200 }
1201 return;
1202 }
1203 ProfilingRegion regionP("Tpetra::MultiVector::permute");
1204
1205 if (verbose) {
1206 std::ostringstream os;
1207 os << *prefix << "Permute" << endl;
1208 std::cerr << os.str();
1209 }
1210
1211 // We could in theory optimize for the case where exactly one of
1212 // them is constant stride, but we don't currently do that.
1213 const bool nonConstStride =
1214 !this->isConstantStride() || !sourceMV.isConstantStride();
1215
1216 if (verbose) {
1217 std::ostringstream os;
1218 os << *prefix << "nonConstStride="
1219 << (nonConstStride ? "true" : "false") << endl;
1220 std::cerr << os.str();
1221 }
1222
1223 // We only need the "which vectors" arrays if either the source or
1224 // target MV is not constant stride. Since we only have one
1225 // kernel that must do double-duty, we have to create a "which
1226 // vectors" array for the MV that _is_ constant stride.
1227 Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1228 Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1229 if (nonConstStride) {
// Constant-stride target: synthesize the identity column map [0..numCols).
1230 if (this->whichVectors_.size() == 0) {
1231 Kokkos::DualView<size_t*, device_type> tmpTgt("tgtWhichVecs", numCols);
1232 tmpTgt.modify_host();
1233 for (size_t j = 0; j < numCols; ++j) {
1234 tmpTgt.view_host()(j) = j;
1235 }
1236 if (!copyOnHost) {
1237 tmpTgt.sync_device();
1238 }
1240 } else {
1241 Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_();
1242 tgtWhichVecs =
1244 "tgtWhichVecs",
1245 copyOnHost);
1246 }
1247 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(tgtWhichVecs.extent(0)) !=
1248 this->getNumVectors(),
1249 std::logic_error, "tgtWhichVecs.extent(0) = " << tgtWhichVecs.extent(0) << " != this->getNumVectors() = " << this->getNumVectors() << ".");
1250
// Same treatment for the source MV's column map.
1251 if (sourceMV.whichVectors_.size() == 0) {
1252 Kokkos::DualView<size_t*, device_type> tmpSrc("srcWhichVecs", numCols);
1253 tmpSrc.modify_host();
1254 for (size_t j = 0; j < numCols; ++j) {
1255 tmpSrc.view_host()(j) = j;
1257 if (!copyOnHost) {
1258 tmpSrc.sync_device();
1259 }
1261 } else {
1262 Teuchos::ArrayView<const size_t> srcWhichVecsT =
1263 sourceMV.whichVectors_();
1264 srcWhichVecs =
1266 "srcWhichVecs",
1267 copyOnHost);
1268 }
1269 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(srcWhichVecs.extent(0)) !=
1270 sourceMV.getNumVectors(),
1271 std::logic_error,
1272 "srcWhichVecs.extent(0) = " << srcWhichVecs.extent(0)
1273 << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors()
1274 << ".");
1275 }
1276
1277 if (copyOnHost) { // permute on host too
1278 if (verbose) {
1279 std::ostringstream os;
1280 os << *prefix << "Get permute LIDs on host" << std::endl;
1281 std::cerr << os.str();
1282 }
1283 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1284 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1285
1286 TEUCHOS_ASSERT(!permuteToLIDs.need_sync_host());
1288 TEUCHOS_ASSERT(!permuteFromLIDs.need_sync_host());
1289 auto permuteFromLIDs_h =
1291
1292 if (verbose) {
1293 std::ostringstream os;
1294 os << *prefix << "Permute on host" << endl;
1295 std::cerr << os.str();
1296 }
1297 if (nonConstStride) {
1298 // No need to sync first, because copyOnHost argument to
1299 // getDualViewCopyFromArrayView puts them in the right place.
1300 auto tgtWhichVecs_h =
1301 create_const_view(tgtWhichVecs.view_host());
1302 auto srcWhichVecs_h =
1303 create_const_view(srcWhichVecs.view_host());
1304 if (CM == ADD_ASSIGN) {
1305 using op_type = KokkosRefactor::Details::AddOp;
1306 permute_array_multi_column_variable_stride(tgt_h, src_h,
1310 srcWhichVecs_h, numCols,
1311 op_type());
1312 } else {
1313 using op_type = KokkosRefactor::Details::InsertOp;
1314 permute_array_multi_column_variable_stride(tgt_h, src_h,
1318 srcWhichVecs_h, numCols,
1319 op_type());
1320 }
1321 } else {
1322 if (CM == ADD_ASSIGN) {
1323 using op_type = KokkosRefactor::Details::AddOp;
1324 permute_array_multi_column(tgt_h, src_h, permuteToLIDs_h,
1325 permuteFromLIDs_h, numCols, op_type());
1326 } else {
1327 using op_type = KokkosRefactor::Details::InsertOp;
1328 permute_array_multi_column(tgt_h, src_h, permuteToLIDs_h,
1329 permuteFromLIDs_h, numCols, op_type());
1330 }
1331 }
1332 } else { // permute on device
1333 if (verbose) {
1334 std::ostringstream os;
1335 os << *prefix << "Get permute LIDs on device" << endl;
1336 std::cerr << os.str();
1337 }
1338 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1339 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1340
1341 TEUCHOS_ASSERT(!permuteToLIDs.need_sync_device());
1342 auto permuteToLIDs_d = create_const_view(permuteToLIDs.view_device());
1343 TEUCHOS_ASSERT(!permuteFromLIDs.need_sync_device());
1344 auto permuteFromLIDs_d =
1345 create_const_view(permuteFromLIDs.view_device());
1347 if (verbose) {
1348 std::ostringstream os;
1349 os << *prefix << "Permute on device" << endl;
1350 std::cerr << os.str();
1352 if (nonConstStride) {
1353 // No need to sync first, because copyOnHost argument to
1354 // getDualViewCopyFromArrayView puts them in the right place.
1356 auto srcWhichVecs_d = create_const_view(srcWhichVecs.view_device());
1357 if (CM == ADD_ASSIGN) {
1358 using op_type = KokkosRefactor::Details::AddOp;
1359 permute_array_multi_column_variable_stride(space, tgt_d, src_d,
1363 srcWhichVecs_d, numCols,
1364 op_type());
1365 } else {
1366 using op_type = KokkosRefactor::Details::InsertOp;
1367 permute_array_multi_column_variable_stride(space, tgt_d, src_d,
1372 op_type());
1373 }
1374 } else {
1375 if (CM == ADD_ASSIGN) {
1376 using op_type = KokkosRefactor::Details::AddOp;
1377 permute_array_multi_column(space, tgt_d, src_d, permuteToLIDs_d,
1378 permuteFromLIDs_d, numCols, op_type());
1379 } else {
1380 using op_type = KokkosRefactor::Details::InsertOp;
1381 permute_array_multi_column(space, tgt_d, src_d, permuteToLIDs_d,
1382 permuteFromLIDs_d, numCols, op_type());
1383 }
1384 }
1385 }
1386
1387 if (verbose) {
1388 std::ostringstream os;
1389 os << *prefix << "Done!" << endl;
1390 std::cerr << os.str();
1392}
1393
// Convenience overload of copyAndPermute without an execution-space
// argument: forwards to the overload above (the space argument on the
// forwarding call is on a source line not visible here).
1394template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1396 const SrcDistObject& sourceObj, const size_t numSameIDs,
1397 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1398 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1399 const CombineMode CM) {
1400 copyAndPermute(sourceObj, numSameIDs, permuteToLIDs, permuteFromLIDs, CM,
1402}
1403
// DistObject hook: pack rows listed in exportLIDs from the source
// MultiVector into the contiguous `exports` buffer (one packet per
// column, so constantNumPackets is set to the number of columns).
// Packing runs on host or device depending on where the source data is
// current. Dispatches to single-column vs. multi-column and constant vs.
// variable-stride pack kernels.
1404template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1407 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1408 Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1409 Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1410 size_t& constantNumPackets,
1411 const execution_space& space) {
1412 using Kokkos::Compat::create_const_view;
1413 using Kokkos::Compat::getKokkosViewDeepCopy;
1414 using std::endl;
1415 using ::Tpetra::Details::Behavior;
1416 using ::Tpetra::Details::ProfilingRegion;
1417 using ::Tpetra::Details::reallocDualViewIfNeeded;
1419
1420 // We've already called checkSizes(), so this cast must succeed.
1421 MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&>(sourceObj));
1422
1423 const bool packOnHost = runKernelOnHost(sourceMV);
1424 const char longFuncNameHost[] = "Tpetra::MultiVector::packAndPrepare[Host]";
1425 const char longFuncNameDevice[] = "Tpetra::MultiVector::packAndPrepare[Device]";
1426 const char tfecfFuncName[] = "packAndPrepare: ";
1428
1429 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1430 // have the option to check indices. We do so when Tpetra is in
1431 // debug mode. It is in debug mode by default in a debug build,
1432 // but you may control this at run time, before launching the
1433 // executable, by setting the TPETRA_DEBUG environment variable to
1434 // "1" (or "TRUE").
1435 const bool debugCheckIndices = Behavior::debug();
1436 // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1437 // environment variable to "1" (or "TRUE") for copious debug
1438 // output to std::cerr on every MPI process. This is unwise for
1439 // runs with large numbers of MPI processes.
1440 const bool printDebugOutput = Behavior::verbose();
1441
1442 std::unique_ptr<std::string> prefix;
1444 auto map = this->getMap();
1445 auto comm = map.is_null() ? Teuchos::null : map->getComm();
1446 const int myRank = comm.is_null() ? -1 : comm->getRank();
1447 std::ostringstream os;
1448 os << "Proc " << myRank << ": MV::packAndPrepare: ";
1449 prefix = std::unique_ptr<std::string>(new std::string(os.str()));
1450 os << "Start" << endl;
1451 std::cerr << os.str();
1452 }
1453
1454 const size_t numCols = sourceMV.getNumVectors();
1455
1456 // This spares us from needing to fill numExportPacketsPerLID.
1457 // Setting constantNumPackets to a nonzero value signals that
1458 // all packets have the same number of entries.
1459 constantNumPackets = numCols;
1460
1461 // If we have no exports, there is nothing to do. Make sure this
1462 // goes _after_ setting constantNumPackets correctly.
1463 if (exportLIDs.extent(0) == 0) {
1464 if (printDebugOutput) {
1465 std::ostringstream os;
1466 os << *prefix << "No exports on this proc, DONE" << std::endl;
1467 std::cerr << os.str();
1468 }
1469 return;
1470 }
1471
1472 /* The layout in the export for MultiVectors is as follows:
1473 exports = { all of the data from row exportLIDs.front() ;
1474 ....
1475 all of the data from row exportLIDs.back() }
1476 This doesn't have the best locality, but is necessary because
1477 the data for a Packet (all data associated with an LID) is
1478 required to be contiguous. */
1479
1480 // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1481 // packing scheme in the above comment? The data going to a
1482 // particular process must be contiguous, of course, but those
1483 // data could include entries from multiple LIDs. DistObject just
1484 // needs to know how to index into that data. Kokkos is good at
1485 // decoupling storage intent from data layout choice.
1486
// Grow (never shrink) the exports buffer to numCols entries per LID.
1487 const size_t numExportLIDs = exportLIDs.extent(0);
1488 const size_t newExportsSize = numCols * numExportLIDs;
1489 if (printDebugOutput) {
1490 std::ostringstream os;
1491 os << *prefix << "realloc: "
1492 << "numExportLIDs: " << numExportLIDs
1493 << ", exports.extent(0): " << exports.extent(0)
1494 << ", newExportsSize: " << newExportsSize << std::endl;
1495 std::cerr << os.str();
1496 }
1497 reallocDualViewIfNeeded(exports, newExportsSize, "exports");
1498
1499 // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1500 // sync it. Pack it where it's currently sync'd.
1501 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(sourceMV.need_sync_device() && sourceMV.need_sync_host(),
1502 std::logic_error,
1503 "Input MultiVector needs sync to both host "
1504 "and device.");
1505 if (printDebugOutput) {
1506 std::ostringstream os;
1507 os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1508 std::cerr << os.str();
1510
1511 // Mark 'exports' here, since we might have resized it above.
1512 // Resizing currently requires calling the constructor, which
1513 // clears out the 'modified' flags.
1514 if (packOnHost) {
1515 // nde 06 Feb 2020: If 'exports' does not require resize
1516 // when reallocDualViewIfNeeded is called, the modified flags
1517 // are not cleared out. This can result in host and device views
1518 // being out-of-sync, resuling in an error in exports.modify_* calls.
1519 // Clearing the sync flags prevents this possible case.
1520 exports.clear_sync_state();
1521 exports.modify_host();
1522 } else {
1523 // nde 06 Feb 2020: If 'exports' does not require resize
1524 // when reallocDualViewIfNeeded is called, the modified flags
1525 // are not cleared out. This can result in host and device views
1526 // being out-of-sync, resuling in an error in exports.modify_* calls.
1527 // Clearing the sync flags prevents this possible case.
1528 exports.clear_sync_state();
1529 exports.modify_device();
1530 }
1531
1532 if (numCols == 1) { // special case for one column only
1533 // MultiVector always represents a single column with constant
1534 // stride, but it doesn't hurt to implement both cases anyway.
1535 //
1536 // ETP: I'm not sure I agree with the above statement. Can't a single-
1537 // column multivector be a subview of another multi-vector, in which case
1538 // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1539 // separately.
1540 //
1541 // mfh 18 Jan 2016: In answer to ETP's comment above:
1542 // MultiVector treats single-column MultiVectors created using a
1543 // "nonconstant stride constructor" as a special case, and makes
1544 // them constant stride (by making whichVectors_ have length 0).
1545 if (sourceMV.isConstantStride()) {
1546 using KokkosRefactor::Details::pack_array_single_column;
1547 if (printDebugOutput) {
1548 std::ostringstream os;
1549 os << *prefix << "Pack numCols=1 const stride" << endl;
1550 std::cerr << os.str();
1551 }
1552 if (packOnHost) {
1553 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1554 pack_array_single_column(exports.view_host(),
1555 src_host,
1556 exportLIDs.view_host(),
1557 0,
1559 } else { // pack on device
1560 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1561 pack_array_single_column(exports.view_device(),
1562 src_dev,
1563 exportLIDs.view_device(),
1564 0,
1566 space);
1567 }
1568 } else {
// Single column of a nonconstant-stride MV: pack the physical
// column given by whichVectors_[0] instead of column 0.
1569 using KokkosRefactor::Details::pack_array_single_column;
1570 if (printDebugOutput) {
1571 std::ostringstream os;
1572 os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1573 std::cerr << os.str();
1574 }
1575 if (packOnHost) {
1576 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1577 pack_array_single_column(exports.view_host(),
1578 src_host,
1579 exportLIDs.view_host(),
1580 sourceMV.whichVectors_[0],
1582 } else { // pack on device
1583 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1584 pack_array_single_column(exports.view_device(),
1585 src_dev,
1586 exportLIDs.view_device(),
1587 sourceMV.whichVectors_[0],
1589 space);
1590 }
1591 }
1592 } else { // the source MultiVector has multiple columns
1593 if (sourceMV.isConstantStride()) {
1594 using KokkosRefactor::Details::pack_array_multi_column;
1595 if (printDebugOutput) {
1596 std::ostringstream os;
1597 os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1598 std::cerr << os.str();
1599 }
1600 if (packOnHost) {
1601 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1602 pack_array_multi_column(exports.view_host(),
1603 src_host,
1604 exportLIDs.view_host(),
1605 numCols,
1607 } else { // pack on device
1608 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1609 pack_array_multi_column(exports.view_device(),
1610 src_dev,
1611 exportLIDs.view_device(),
1612 numCols,
1614 space);
1616 } else {
1617 using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1619 std::ostringstream os;
1620 os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1621 << endl;
1622 std::cerr << os.str();
1623 }
1624 // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1625 // whichVectors_ can be expensive, but pack and unpack for
1626 // nonconstant-stride MultiVectors is slower anyway.
1627 using IST = impl_scalar_type;
1628 using DV = Kokkos::DualView<IST*, device_type>;
1629 using HES = typename DV::t_host::execution_space;
1630 using DES = typename DV::t_dev::execution_space;
1631 Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_();
1632 if (packOnHost) {
1633 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1634 pack_array_multi_column_variable_stride(exports.view_host(),
1635 src_host,
1636 exportLIDs.view_host(),
1638 numCols,
1640 } else { // pack on device
1641 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1642 pack_array_multi_column_variable_stride(exports.view_device(),
1643 src_dev,
1644 exportLIDs.view_device(),
1646 numCols,
1648 }
1649 }
1650 }
1651
1652 if (printDebugOutput) {
1653 std::ostringstream os;
1654 os << *prefix << "Done!" << endl;
1655 std::cerr << os.str();
1656 }
1657}
// Convenience overload of packAndPrepare without an execution-space
// argument; the forwarding call is on a source line not visible here,
// but presumably delegates to the overload above — confirm in the
// repository source.
1659template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1662 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1663 Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1664 Kokkos::DualView<size_t*, buffer_device_type> numExportPacketsPerLID,
1665 size_t& constantNumPackets) {
1667}
// reallocImportsIfNeededImpl, enabled when the communication buffer
// memory space matches the Node's device memory space. MultiVector-
// specific optimization: when the conditions listed below hold, the
// imports_ buffer is aliased to the tail of the MultiVector's own data
// view so that received data lands directly in place and
// unpackAndCombine can be skipped; otherwise falls back to a plain
// reallocation of unaliased_imports_. Returns true iff a (re)allocation
// or re-aliasing happened.
1669template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1670template <class NO>
1671typename std::enable_if<std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1672 typename NO::device_type::memory_space>::value,
1673 bool>::type
1676 const bool verbose,
1677 const std::string* prefix,
1678 const bool areRemoteLIDsContiguous,
1680 // This implementation of reallocImportsIfNeeded is an
1681 // optimization that is specific to MultiVector. We check if the
1682 // imports_ view can be aliased to the end of the data view_. If
1683 // that is the case, we can skip the unpackAndCombine call.
1684
1685 if (verbose) {
1686 std::ostringstream os;
1687 os << *prefix << "Realloc (if needed) imports_ from "
1688 << this->imports_.extent(0) << " to " << newSize << std::endl;
1689 std::cerr << os.str();
1690 }
1691
1692 bool reallocated = false;
1693
1694 using IST = impl_scalar_type;
1695 using DV = Kokkos::DualView<IST*, Kokkos::LayoutLeft, buffer_device_type>;
1696
1697 // Conditions for aliasing memory:
1698 // - When both sides of the dual view are in the same memory
1699 // space, we do not need to worry about syncing things.
1700 // - When both memory spaces are different, we only alias if this
1701 // does not incur additional sync'ing.
1702 // - The remote LIDs need to be contiguous, so that we do not need
1703 // to reorder received information.
1704 // - CombineMode needs to be INSERT.
1705 // - The number of vectors needs to be 1, otherwise we need to
1706 // reorder the received data.
1707 if ((std::is_same_v<typename dual_view_type::t_dev::device_type, typename dual_view_type::t_host::device_type> ||
1708 (Details::Behavior::assumeMpiIsGPUAware() && !this->need_sync_device()) ||
1709 (!Details::Behavior::assumeMpiIsGPUAware() && !this->need_sync_host())) &&
1710 areRemoteLIDsContiguous &&
1711 (CM == INSERT || CM == REPLACE) &&
1712 (getNumVectors() == 1) &&
1713 (newSize > 0)) {
// Alias imports_ to the last newSize rows of the MV's own storage.
// Only re-create the subview if it is not already aliased there.
1714 size_t offset = getLocalLength() - newSize;
1715 reallocated = this->imports_.view_device().data() != view_.getDualView().view_device().data() + offset;
1716 if (reallocated) {
1717 typedef std::pair<size_t, size_t> range_type;
1718 this->imports_ = DV(view_.getDualView(),
1719 range_type(offset, getLocalLength()),
1720 0);
1721
1722 if (verbose) {
1723 std::ostringstream os;
1724 os << *prefix << "Aliased imports_ to MV.view_" << std::endl;
1725 std::cerr << os.str();
1726 }
1727 }
1728 return reallocated;
1729 }
// Fallback: ordinary (non-aliased) imports buffer.
1730 {
1731 using ::Tpetra::Details::reallocDualViewIfNeeded;
1732 reallocated =
1733 reallocDualViewIfNeeded(this->unaliased_imports_, newSize, "imports");
1734 if (verbose) {
1735 std::ostringstream os;
1736 os << *prefix << "Finished realloc'ing unaliased_imports_" << std::endl;
1737 std::cerr << os.str();
1738 }
1739 this->imports_ = this->unaliased_imports_;
1740 }
1741 return reallocated;
1742}
1743
// Fallback overload of reallocImportsIfNeededImpl, SFINAE-enabled only
// when the communication-buffer memory space does NOT match the device
// memory space (see the enable_if condition below).  In that case the
// imports_-aliasing optimization used by the other overload cannot
// apply.
// NOTE(review): this text is a documentation-generator extraction; the
// lines carrying the qualified function name, the newSize parameter,
// and the body (original lines 1749-1750 and 1755) were swallowed by
// hyperlink markup.  Consult the real Tpetra_MultiVector_def.hpp
// before editing -- presumably the missing body just calls the plain
// reallocDualViewIfNeeded path and returns its result; verify.
1744 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1745 template <class NO>
1746 typename std::enable_if<!std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1747  typename NO::device_type::memory_space>::value,
1748  bool>::type
1751  const bool verbose,
1752  const std::string* prefix,
1753  const bool areRemoteLIDsContiguous,
1754  const CombineMode CM) {
1756 }
1757
// Public entry point: dispatches to the reallocImportsIfNeededImpl
// overload selected (via enable_if on the Node type) at compile time.
// Returns whatever the selected implementation returns, i.e. whether
// the imports_ buffer was actually reallocated/re-aliased.
// NOTE(review): the qualified return-type/name line (original line
// 1759) was swallowed by documentation markup.
1758 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1760  reallocImportsIfNeeded(const size_t newSize,
1761  const bool verbose,
1762  const std::string* prefix,
1763  const bool areRemoteLIDsContiguous,
1764  const CombineMode CM) {
1766  return reallocImportsIfNeededImpl<Node>(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1767 }
1768
// Returns true iff imports_ is aliased to the tail of the local data:
// the two device views end at the same address (end-of-imports_ ==
// end-of-view_), which is exactly the aliasing that
// reallocImportsIfNeededImpl sets up above.
// NOTE(review): the qualified signature lines (original 1770-1771)
// were swallowed by documentation markup.
1769 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1772  return (this->imports_.view_device().data() + this->imports_.view_device().extent(0) ==
1773  view_.getDualView().view_device().data() + view_.getDualView().view_device().extent(0));
1774 }
1775
// unpackAndCombine (execution-space overload): copy received import
// data out of the MPI receive buffer ('imports') into this
// MultiVector's rows named by 'importLIDs', combining with existing
// values per CombineMode (INSERT/REPLACE overwrite, ADD/ADD_ASSIGN
// accumulate, ABSMAX takes the elementwise absolute max).
// Short-circuits when there is nothing to unpack or when imports_ is
// aliased into view_ (the data already landed in place).
// NOTE(review): this is a documentation-generator extraction; the
// qualified signature line (original 1777), the longFuncName selection
// (original 1795), a closing brace (original 1837), and most argument
// lines of the unpack_array_multi_column* kernel calls (e.g. original
// 1933-1937) were swallowed by hyperlink markup.  Do not edit from
// this text; consult the real Tpetra_MultiVector_def.hpp.
1776 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1778  unpackAndCombine(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1779  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1780  Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1781  const size_t constantNumPackets,
1782  const CombineMode CM,
1783  const execution_space& space) {
1784  using Kokkos::Compat::getKokkosViewDeepCopy;
1785  using KokkosRefactor::Details::unpack_array_multi_column;
1786  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1787  using std::endl;
1788  using ::Tpetra::Details::Behavior;
1789  using ::Tpetra::Details::ProfilingRegion;
1790
1791  const bool unpackOnHost = runKernelOnHost(imports);
1792
1793  const char longFuncNameHost[] = "Tpetra::MultiVector::unpackAndCombine[Host]";
1794  const char longFuncNameDevice[] = "Tpetra::MultiVector::unpackAndCombine[Device]";
1796  const char tfecfFuncName[] = "unpackAndCombine: ";
1797
1798  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1799  // have the option to check indices.  We do so when Tpetra is in
1800  // debug mode.  It is in debug mode by default in a debug build,
1801  // but you may control this at run time, before launching the
1802  // executable, by setting the TPETRA_DEBUG environment variable to
1803  // "1" (or "TRUE").
1804  const bool debugCheckIndices = Behavior::debug();
1805
1806  const bool printDebugOutput = Behavior::verbose();
1807  std::unique_ptr<std::string> prefix;
1808  if (printDebugOutput) {
1809  auto map = this->getMap();
1810  auto comm = map.is_null() ? Teuchos::null : map->getComm();
1811  const int myRank = comm.is_null() ? -1 : comm->getRank();
1812  std::ostringstream os;
1813  os << "Proc " << myRank << ": " << longFuncName << ": ";
1814  prefix = std::unique_ptr<std::string>(new std::string(os.str()));
1815  os << "Start" << endl;
1816  std::cerr << os.str();
1817  }
1818
1819  // If we have no imports, there is nothing to do
1820  if (importLIDs.extent(0) == 0) {
1821  if (printDebugOutput) {
1822  std::ostringstream os;
1823  os << *prefix << "No imports. Done!" << endl;
1824  std::cerr << os.str();
1825  }
1826  return;
1827  }
1828
1829  // Check, whether imports_ is a subview of the MV view.
1830  if (importsAreAliased()) {
1831  if (printDebugOutput) {
1832  std::ostringstream os;
1833  os << *prefix << "Skipping unpack, since imports_ is aliased to MV.view_. Done!" << endl;
1834  std::cerr << os.str();
1835  }
1836  return;
1838
1839  ProfilingRegion regionUAC(longFuncName);
1840
1841  const size_t numVecs = getNumVectors();
1842  if (debugCheckIndices) {
1843  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(imports.extent(0)) !=
1844  numVecs * importLIDs.extent(0),
1845  std::runtime_error,
1846  "imports.extent(0) = " << imports.extent(0)
1847  << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1848  << " * " << importLIDs.extent(0) << " = "
1849  << numVecs * importLIDs.extent(0) << ".");
1850
1851  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(constantNumPackets == static_cast<size_t>(0), std::runtime_error,
1852  "constantNumPackets input argument must be nonzero.");
1853
1854  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(numVecs) !=
1855  static_cast<size_t>(constantNumPackets),
1856  std::runtime_error, "constantNumPackets must equal numVecs.");
1857  }
1858
1859  // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1860  // the memory space in which the imports buffer was last modified and
1861  // the size of the imports buffer.
1862  // DistObject::doTransferNew decides where it was last modified (based on
1863  // whether communication buffers used were on host or device).
1864  if (unpackOnHost) {
1865  if (this->imports_.need_sync_host()) this->imports_.sync_host();
1866  } else {
1867  if (this->imports_.need_sync_device()) this->imports_.sync_device();
1868  }
1869
1870  if (printDebugOutput) {
1871  std::ostringstream os;
1872  os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1873  << endl;
1874  std::cerr << os.str();
1875  }
1876
1877  // We have to sync before modifying, because this method may read
1878  // as well as write (depending on the CombineMode).
1879  auto imports_d = imports.view_device();
1880  auto imports_h = imports.view_host();
1881  auto importLIDs_d = importLIDs.view_device();
1882  auto importLIDs_h = importLIDs.view_host();
1883
1884  Kokkos::DualView<size_t*, device_type> whichVecs;
1885  if (!isConstantStride()) {
1886  Kokkos::View<const size_t*, Kokkos::HostSpace,
1887  Kokkos::MemoryUnmanaged>
1888  whichVecsIn(whichVectors_.getRawPtr(),
1889  numVecs);
1890  whichVecs = Kokkos::DualView<size_t*, device_type>("whichVecs", numVecs);
1891  if (unpackOnHost) {
1892  whichVecs.modify_host();
1893  // DEEP_COPY REVIEW - NOT TESTED FOR CUDA BUILD
1894  Kokkos::deep_copy(whichVecs.view_host(), whichVecsIn);
1895  } else {
1896  whichVecs.modify_device();
1897  // DEEP_COPY REVIEW - HOST-TO-DEVICE
1898  Kokkos::deep_copy(whichVecs.view_device(), whichVecsIn);
1899  }
1900  }
1901  auto whichVecs_d = whichVecs.view_device();
1902  auto whichVecs_h = whichVecs.view_host();
1903
1904  /* The layout in the export for MultiVectors is as follows:
1905  imports = { all of the data from row exportLIDs.front() ;
1906  ....
1907  all of the data from row exportLIDs.back() }
1908  This doesn't have the best locality, but is necessary because
1909  the data for a Packet (all data associated with an LID) is
1910  required to be contiguous. */
1911
1912  if (numVecs > 0 && importLIDs.extent(0) > 0) {
1913  using dev_exec_space = typename dual_view_type::t_dev::execution_space;
1914  using host_exec_space = typename dual_view_type::t_host::execution_space;
1915
1916  // This fixes GitHub Issue #4418.
1917  const bool use_atomic_updates = unpackOnHost ? host_exec_space().concurrency() != 1 : dev_exec_space().concurrency() != 1;
1918
1919  if (printDebugOutput) {
1920  std::ostringstream os;
1921  os << *prefix << "Unpack: " << combineModeToString(CM) << endl;
1922  std::cerr << os.str();
1923  }
1924
1925  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1926  // custom combine modes, start editing here.
1927
1928  if (CM == INSERT || CM == REPLACE) {
1929  using op_type = KokkosRefactor::Details::InsertOp;
1930  if (isConstantStride()) {
1931  if (unpackOnHost) {
1932  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1933  unpack_array_multi_column(host_exec_space(),
1935  op_type(), numVecs,
1938
1939  } else { // unpack on device
1940  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1941  unpack_array_multi_column(space,
1943  op_type(), numVecs,
1946  }
1947  } else { // not constant stride
1948  if (unpackOnHost) {
1949  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1950  unpack_array_multi_column_variable_stride(host_exec_space(),
1951  X_h, imports_h,
1954  op_type(),
1958  } else { // unpack on device
1959  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1960  unpack_array_multi_column_variable_stride(space,
1961  X_d, imports_d,
1964  op_type(),
1965  numVecs,
1968  }
1969  }
1970  } else if (CM == ADD || CM == ADD_ASSIGN) {
1971  using op_type = KokkosRefactor::Details::AddOp;
1972  if (isConstantStride()) {
1973  if (unpackOnHost) {
1974  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1975  unpack_array_multi_column(host_exec_space(),
1977  op_type(), numVecs,
1980  } else { // unpack on device
1981  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1982  unpack_array_multi_column(space,
1984  op_type(), numVecs,
1987  }
1988  } else { // not constant stride
1989  if (unpackOnHost) {
1990  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1991  unpack_array_multi_column_variable_stride(host_exec_space(),
1992  X_h, imports_h,
1995  op_type(),
1996  numVecs,
1999  } else { // unpack on device
2000  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2001  unpack_array_multi_column_variable_stride(space,
2002  X_d, imports_d,
2005  op_type(),
2006  numVecs,
2009  }
2010  }
2011  } else if (CM == ABSMAX) {
2012  using op_type = KokkosRefactor::Details::AbsMaxOp;
2013  if (isConstantStride()) {
2014  if (unpackOnHost) {
2015  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2016  unpack_array_multi_column(host_exec_space(),
2018  op_type(), numVecs,
2021  } else { // unpack on device
2022  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2023  unpack_array_multi_column(space,
2025  op_type(), numVecs,
2028  }
2029  } else {
2030  if (unpackOnHost) {
2031  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2032  unpack_array_multi_column_variable_stride(host_exec_space(),
2033  X_h, imports_h,
2036  op_type(),
2037  numVecs,
2040  } else { // unpack on device
2041  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2042  unpack_array_multi_column_variable_stride(space,
2043  X_d, imports_d,
2046  op_type(),
2047  numVecs,
2050  }
2051  }
2052  } else {
2053  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, "Invalid CombineMode");
2054  }
2055  } else {
2056  if (printDebugOutput) {
2057  std::ostringstream os;
2058  os << *prefix << "Nothing to unpack" << endl;
2059  std::cerr << os.str();
2060  }
2061  }
2062
2063  if (printDebugOutput) {
2064  std::ostringstream os;
2065  os << *prefix << "Done!" << endl;
2066  std::cerr << os.str();
2067  }
2068 }
2069
// Convenience overload without an execution-space argument: forwards
// to the main unpackAndCombine above with a default-constructed
// execution_space instance.
// NOTE(review): the qualified signature line (original 2071) and the
// constantNumPackets parameter line (original 2075) were swallowed by
// documentation markup.
2070 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2072  unpackAndCombine(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
2073  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
2074  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
2076  const CombineMode CM) {
2077  unpackAndCombine(importLIDs, imports, numPacketsPerLID, constantNumPackets, CM, execution_space());
2078 }
2079
// Number of columns (vectors): for a constant-stride MultiVector this
// is the extent of the second view dimension; for a noncontiguous
// column view it is the number of selected columns in whichVectors_.
// NOTE(review): the qualified name line (original 2082) was swallowed
// by documentation markup.
2080 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2081 size_t
2083  getNumVectors() const {
2084  if (isConstantStride()) {
2085  return static_cast<size_t>(view_.extent(1));
2086  } else {
2087  return static_cast<size_t>(whichVectors_.size());
2088  }
2089 }
2090
// File-local helper: performs the interprocess (all-reduce) part of a
// dot product, in place in dotsOut, but only when the MultiVector is
// distributed and the communicator is nonnull.  For an intercomm a
// temporary copy of the local sums is made because MPI forbids
// aliasing input and output buffers there.
// NOTE(review): the function signature line (original 2094), the
// commIsInterComm computation (original 2123), and the reduceAll call
// lines (original 2133-2134, 2137) were swallowed by documentation
// markup.
2091 namespace { // (anonymous)
2092
2093 template <class RV>
2095  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2096  const bool distributed) {
2097  using Teuchos::REDUCE_MAX;
2098  using Teuchos::REDUCE_SUM;
2099  using Teuchos::reduceAll;
2100  typedef typename RV::non_const_value_type dot_type;
2101
2102  Details::ProfilingRegion region("Tpetra::gblDotImpl");
2103
2104  const size_t numVecs = dotsOut.extent(0);
2105
2106  // If the MultiVector is distributed over multiple processes, do
2107  // the distributed (interprocess) part of the dot product.  We
2108  // assume that the MPI implementation can read from and write to
2109  // device memory.
2110  //
2111  // replaceMap() may have removed some processes.  Those
2112  // processes have a null Map.  They must not participate in any
2113  // collective operations.  We ask first whether the Map is null,
2114  // because isDistributed() defers that question to the Map.  We
2115  // still compute and return local dots for processes not
2116  // participating in collective operations; those probably don't
2117  // make any sense, but it doesn't hurt to do them, since it's
2118  // illegal to call dot() on those processes anyway.
2119  if (distributed && !comm.is_null()) {
2120  // The calling process only participates in the collective if
2121  // both the Map and its Comm on that process are nonnull.
2122  const int nv = static_cast<int>(numVecs);
2124
2125  if (commIsInterComm) {
2126  // If comm is an intercomm, then we may not alias input and
2127  // output buffers, so we have to make a copy of the local
2128  // sum.
2129  typename RV::non_const_type lclDots(Kokkos::ViewAllocateWithoutInitializing("tmp"), numVecs);
2130  // DEEP_COPY REVIEW - NOT TESTED
2131  Kokkos::deep_copy(lclDots, dotsOut);
2132  const dot_type* const lclSum = lclDots.data();
2133  dot_type* const gblSum = dotsOut.data();
2135  } else {
2136  dot_type* const inout = dotsOut.data();
2138  }
2139  }
2140 }
2141 } // namespace
// dot (Kokkos::View overload): computes the dot products of the
// columns of *this against the corresponding columns of A, writing the
// results into the host View 'dots'.  Performs debug-mode Map
// compatibility and dimension checks, runs the local kernel
// (Details::lclDot) on device, fences so the host-side local results
// are ready, then calls gblDotImpl for the interprocess reduction.
// NOTE(review): the qualified signature lines (original 2144-2145),
// the dotsOut subview setup (original 2182, 2202-2203), and the
// exec_space_instance construction (original 2220) were swallowed by
// documentation markup.
2143 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2146  const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const {
2147  using Kokkos::subview;
2148  using Teuchos::Comm;
2149  using Teuchos::null;
2150  using Teuchos::RCP;
2151  using ::Tpetra::Details::Behavior;
2152  // View of all the dot product results.
2153  typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
2154  typedef typename dual_view_type::t_dev_const XMV;
2155  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
2156
2157  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::dot (Kokkos::View)");
2159  const size_t numVecs = this->getNumVectors();
2160  if (numVecs == 0) {
2161  return;  // nothing to do
2162  }
2163  const size_t lclNumRows = this->getLocalLength();
2164  const size_t numDots = static_cast<size_t>(dots.extent(0));
2165  const bool debug = Behavior::debug();
2166
2167  if (debug) {
2168  const bool compat = this->getMap()->isCompatible(*(A.getMap()));
2169  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!compat, std::invalid_argument,
2170  "'*this' MultiVector is not "
2171  "compatible with the input MultiVector A.  We only test for this "
2172  "in debug mode.");
2173  }
2174
2175  // FIXME (mfh 11 Jul 2014) These exception tests may not
2176  // necessarily be thrown on all processes consistently.  We should
2177  // instead pass along error state with the inner product.  We
2178  // could do this by setting an extra slot to
2179  // Kokkos::ArithTraits<dot_type>::one() on error.  The
2180  // final sum should be
2181  // Kokkos::ArithTraits<dot_type>::zero() if not error.
2183  lclNumRows != A.getLocalLength(), std::runtime_error,
2184  "MultiVectors do not have the same local length.  "
2185  "this->getLocalLength() = "
2186  << lclNumRows << " != "
2187  "A.getLocalLength() = "
2188  << A.getLocalLength() << ".");
2190  numVecs != A.getNumVectors(), std::runtime_error,
2191  "MultiVectors must have the same number of columns (vectors).  "
2192  "this->getNumVectors() = "
2193  << numVecs << " != "
2194  "A.getNumVectors() = "
2195  << A.getNumVectors() << ".");
2197  numDots != numVecs, std::runtime_error,
2198  "The output array 'dots' must have the same number of entries as the "
2199  "number of columns (vectors) in *this and A.  dots.extent(0) = "
2200  << numDots << " != this->getNumVectors() = " << numVecs << ".");
2201
2202  const std::pair<size_t, size_t> colRng(0, numVecs);
2204  RCP<const Comm<int> > comm = this->getMap().is_null() ? null : this->getMap()->getComm();
2205
2206  auto thisView = this->getLocalViewDevice(Access::ReadOnly);
2207  auto A_view = A.getLocalViewDevice(Access::ReadOnly);
2208
2209  ::Tpetra::Details::lclDot<RV, XMV>(dotsOut, thisView, A_view, lclNumRows, numVecs,
2210  this->whichVectors_.getRawPtr(),
2211  A.whichVectors_.getRawPtr(),
2212  this->isConstantStride(), A.isConstantStride());
2213
2214  // lbv 15 mar 2023: Kokkos Kernels provides non-blocking BLAS
2215  // functions unless they explicitely return a value to Host.
2216  // Here while the lclDot are on host, they are not a return
2217  // value, therefore they might be avaible to us immediately.
2218  // Adding a frnce here guarantees that we will have the lclDot
2219  // ahead of the MPI reduction.
2221  exec_space_instance.fence("Tpetra::MultiVector::dot");
2222
2223  gblDotImpl(dotsOut, comm, this->isDistributed());
2224 }
2225
// File-local fast path for the single-column, constant-stride case of
// dot: computes the local dot of column 0 of x against column 0 of y
// with KokkosBlas::dot, then (if x is distributed) all-reduces the
// scalar.  Returns zero when the communicator is null (excluded
// process).
// NOTE(review): the function signature lines (original 2228-2230), the
// reduceAll call (original 2270), and two closing-brace lines
// (original 2273, 2276) were swallowed by documentation markup.
2226 namespace { // (anonymous)
2227 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2231  using ::Tpetra::Details::ProfilingRegion;
2233  using dot_type = typename MV::dot_type;
2234  ProfilingRegion region("Tpetra::multiVectorSingleColumnDot");
2235
2236  auto map = x.getMap();
2237  Teuchos::RCP<const Teuchos::Comm<int> > comm =
2238  map.is_null() ? Teuchos::null : map->getComm();
2239  if (comm.is_null()) {
2240  return KokkosKernels::ArithTraits<dot_type>::zero();
2241  } else {
2242  dot_type lclDot = KokkosKernels::ArithTraits<dot_type>::zero();
2243  dot_type gblDot = KokkosKernels::ArithTraits<dot_type>::zero();
2244  {
2245  ProfilingRegion regionLclDot("Tpetra::multiVectorSingleColumnDot::lclDot");
2246  using LO = LocalOrdinal;
2247  // The min just ensures that we don't overwrite memory that
2248  // doesn't belong to us, in the erroneous input case where x
2249  // and y have different numbers of rows.
2250  const LO lclNumRows = static_cast<LO>(std::min(x.getLocalLength(),
2251  y.getLocalLength()));
2252  const Kokkos::pair<LO, LO> rowRng(0, lclNumRows);
2253
2254  // All non-unary kernels are executed on the device as per Tpetra policy.  Sync to device if needed.
2255  // const_cast<MV&>(x).sync_device ();
2256  // const_cast<MV&>(y).sync_device ();
2257
2258  auto x_2d = x.getLocalViewDevice(Access::ReadOnly);
2259  auto x_1d = Kokkos::subview(x_2d, rowRng, 0);
2260  auto y_2d = y.getLocalViewDevice(Access::ReadOnly);
2261  auto y_1d = Kokkos::subview(y_2d, rowRng, 0);
2262  lclDot = KokkosBlas::dot(x_1d, y_1d);
2263  }
2264
2265  if (x.isDistributed()) {
2266  ProfilingRegion regionReduce("Tpetra::multiVectorSingleColumnDot::reduce");
2267  using Teuchos::outArg;
2268  using Teuchos::REDUCE_SUM;
2269  using Teuchos::reduceAll;
2271  } else {
2272  gblDot = lclDot;
2274  return gblDot;
2275  }
2277 } // namespace
// dot (Teuchos::ArrayView overload): validates dimensions, then either
// takes the single-column constant-stride fast path
// (multiVectorSingleColumnDot) or wraps the ArrayView in an unmanaged
// host View and forwards to the Kokkos::View overload above.
// NOTE(review): the qualified signature lines (original 2280-2281),
// the numDots check macro line (original 2310), and the fast-path call
// (original 2316) were swallowed by documentation markup.
2279 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2282  const Teuchos::ArrayView<dot_type>& dots) const {
2283  const char tfecfFuncName[] = "dot: ";
2284  ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::dot (Teuchos::ArrayView)");
2285
2286  const size_t numVecs = this->getNumVectors();
2287  const size_t lclNumRows = this->getLocalLength();
2288  const size_t numDots = static_cast<size_t>(dots.size());
2289
2290  // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2291  // not necessarily be thrown on all processes consistently.  We
2292  // keep them for now, because MultiVector's unit tests insist on
2293  // them.  In the future, we should instead pass along error state
2294  // with the inner product.  We could do this by setting an extra
2295  // slot to Kokkos::ArithTraits<dot_type>::one() on error.
2296  // The final sum should be
2297  // Kokkos::ArithTraits<dot_type>::zero() if not error.
2298  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclNumRows != A.getLocalLength(), std::runtime_error,
2299  "MultiVectors do not have the same local length.  "
2300  "this->getLocalLength() = "
2301  << lclNumRows << " != "
2302  "A.getLocalLength() = "
2303  << A.getLocalLength() << ".");
2304  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numVecs != A.getNumVectors(), std::runtime_error,
2305  "MultiVectors must have the same number of columns (vectors).  "
2306  "this->getNumVectors() = "
2307  << numVecs << " != "
2308  "A.getNumVectors() = "
2309  << A.getNumVectors() << ".");
2311  "The output array 'dots' must have the same number of entries as the "
2312  "number of columns (vectors) in *this and A.  dots.extent(0) = "
2313  << numDots << " != this->getNumVectors() = " << numVecs << ".");
2314
2315  if (numVecs == 1 && this->isConstantStride() && A.isConstantStride()) {
2317  dots[0] = gblDot;
2318  } else {
2319  this->dot(A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr(), numDots));
2320  }
2321 }
2322
// Two-norm of each column, host output via Teuchos::ArrayView.
// Delegates to multiVectorNormImpl with NORM_TWO; the const_cast is
// needed because the norm implementation may sync X between memory
// spaces.
// NOTE(review): the qualified signature line (original 2324) and a
// using/typedef line (original 2326) were swallowed by documentation
// markup.
2323 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2325  norm2(const Teuchos::ArrayView<mag_type>& norms) const {
2327  using ::Tpetra::Details::NORM_TWO;
2328  using ::Tpetra::Details::ProfilingRegion;
2329  ProfilingRegion region("Tpetra::MV::norm2 (host output)");
2330
2331  // The function needs to be able to sync X.
2332  MV& X = const_cast<MV&>(*this);
2333  multiVectorNormImpl(norms.getRawPtr(), X, NORM_TWO);
2334 }
2335
// norm2 overload taking a host Kokkos::View: wraps the View's data in
// an ArrayView and forwards to the ArrayView overload above.
2336 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2338  norm2(const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const {
2339  Teuchos::ArrayView<mag_type> norms_av(norms.data(), norms.extent(0));
2340  this->norm2(norms_av);
2341 }
2342
// One-norm of each column, host output via Teuchos::ArrayView.
// Delegates to multiVectorNormImpl with NORM_ONE; the const_cast is
// needed because the norm implementation may sync X between memory
// spaces.
2343 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2345  norm1(const Teuchos::ArrayView<mag_type>& norms) const {
2347  using ::Tpetra::Details::NORM_ONE;
2348  using ::Tpetra::Details::ProfilingRegion;
2349  ProfilingRegion region("Tpetra::MV::norm1 (host output)");
2350
2351  // The function needs to be able to sync X.
2352  MV& X = const_cast<MV&>(*this);
2353  multiVectorNormImpl(norms.data(), X, NORM_ONE);
2354 }
2355
// norm1 overload taking a host Kokkos::View: wraps the View's data in
// an ArrayView and forwards to the ArrayView overload above.
2356 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2358  norm1(const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const {
2359  Teuchos::ArrayView<mag_type> norms_av(norms.data(), norms.extent(0));
2360  this->norm1(norms_av);
2361 }
2362
// Infinity-norm of each column, host output via Teuchos::ArrayView.
// Delegates to multiVectorNormImpl with NORM_INF; the const_cast is
// needed because the norm implementation may sync X between memory
// spaces.
2363 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2365  normInf(const Teuchos::ArrayView<mag_type>& norms) const {
2367  using ::Tpetra::Details::NORM_INF;
2368  using ::Tpetra::Details::ProfilingRegion;
2369  ProfilingRegion region("Tpetra::MV::normInf (host output)");
2370
2371  // The function needs to be able to sync X.
2372  MV& X = const_cast<MV&>(*this);
2373  multiVectorNormImpl(norms.getRawPtr(), X, NORM_INF);
2374 }
2375
// normInf overload taking a host Kokkos::View: wraps the View's data
// in an ArrayView and forwards to the ArrayView overload above.
2376 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2378  normInf(const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const {
2379  Teuchos::ArrayView<mag_type> norms_av(norms.data(), norms.extent(0));
2380  this->normInf(norms_av);
2381 }
2382
// meanValue: computes the arithmetic mean of each column.  Runs the
// local column sums (KokkosBlas::sum) on whichever side of the
// DualView was most recently modified, all-reduces the sums when the
// vector is distributed, then divides by the global length.
// NOTE(review): the qualified signature line (original 2385), the
// meansOut wrapper construction (original 2415-2416), and the final
// scaling statements (original 2487, 2490) were swallowed by
// documentation markup.
2384 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2385  meanValue(const Teuchos::ArrayView<impl_scalar_type>& means) const {
2386  // KR FIXME Overload this method to take a View.
2387  using Kokkos::ALL;
2388  using Kokkos::subview;
2389  using Teuchos::Comm;
2390  using Teuchos::RCP;
2391  using Teuchos::REDUCE_SUM;
2392  using Teuchos::reduceAll;
2393  typedef KokkosKernels::ArithTraits<impl_scalar_type> ATS;
2394
2395  const size_t lclNumRows = this->getLocalLength();
2396  const size_t numVecs = this->getNumVectors();
2397  const size_t numMeans = static_cast<size_t>(means.size());
2398
2400  numMeans != numVecs, std::runtime_error,
2401  "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2402  << " != this->getNumVectors() = " << numVecs << ".");
2403
2404  const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2405  const std::pair<size_t, size_t> colRng(0, numVecs);
2406
2407  // Make sure that the final output view has the same layout as the
2408  // temporary view's host_mirror_type.  Left or Right doesn't matter for
2409  // a 1-D array anyway; this is just to placate the compiler.
2410  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2411  typedef Kokkos::View<impl_scalar_type*,
2412  typename local_view_type::host_mirror_type::array_layout,
2413  Kokkos::HostSpace,
2414  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
2417
2418  RCP<const Comm<int> > comm = this->getMap().is_null() ? Teuchos::null : this->getMap()->getComm();
2419
2420  // If we need sync to device, then host has the most recent version.
2421  const bool useHostVersion = this->need_sync_device();
2422  if (useHostVersion) {
2423  // DualView was last modified on host, so run the local kernel there.
2424  auto X_lcl = subview(getLocalViewHost(Access::ReadOnly),
2425  rowRng, Kokkos::ALL());
2426  // Compute the local sum of each column.
2427  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums("MV::meanValue tmp", numVecs);
2428  if (isConstantStride()) {
2429  KokkosBlas::sum(lclSums, X_lcl);
2430  } else {
2431  for (size_t j = 0; j < numVecs; ++j) {
2432  const size_t col = whichVectors_[j];
2433  KokkosBlas::sum(subview(lclSums, j), subview(X_lcl, ALL(), col));
2434  }
2435  }
2436
2437  // If there are multiple MPI processes, the all-reduce reads
2438  // from lclSums, and writes to meansOut.  Otherwise, we just
2439  // copy lclSums into meansOut.
2440  if (!comm.is_null() && this->isDistributed()) {
2441  reduceAll(*comm, REDUCE_SUM, static_cast<int>(numVecs),
2442  lclSums.data(), meansOut.data());
2443  } else {
2444  // DEEP_COPY REVIEW - NOT TESTED
2445  Kokkos::deep_copy(meansOut, lclSums);
2446  }
2447  } else {
2448  // DualView was last modified on device, so run the local kernel there.
2449  auto X_lcl = subview(this->getLocalViewDevice(Access::ReadOnly),
2450  rowRng, Kokkos::ALL());
2451
2452  // Compute the local sum of each column.
2453  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums("MV::meanValue tmp", numVecs);
2454  if (isConstantStride()) {
2455  KokkosBlas::sum(lclSums, X_lcl);
2456  } else {
2457  for (size_t j = 0; j < numVecs; ++j) {
2458  const size_t col = whichVectors_[j];
2459  KokkosBlas::sum(subview(lclSums, j), subview(X_lcl, ALL(), col));
2460  }
2461  }
2462  // lbv 10 mar 2023: Kokkos Kernels provides non-blocking BLAS
2463  // functions unless they explicitly return a value to Host.
2464  // Here while the lclSums are on the host, they are not a return
2465  // value, therefore they might be available to us immediately.
2466  // Adding a fence here guarantees that we will have the lclSums
2467  // ahead of the MPI reduction.
2469  exec_space_instance.fence("Tpetra::MultiVector::mean");
2470
2471  // If there are multiple MPI processes, the all-reduce reads
2472  // from lclSums, and writes to meansOut.  (We assume that MPI
2473  // can read device memory.)  Otherwise, we just copy lclSums
2474  // into meansOut.
2475  if (!comm.is_null() && this->isDistributed()) {
2476  reduceAll(*comm, REDUCE_SUM, static_cast<int>(numVecs),
2477  lclSums.data(), meansOut.data());
2478  } else {
2479  // DEEP_COPY REVIEW - HOST-TO-HOST - NOT TESTED FOR MPI BUILD
2480  Kokkos::deep_copy(meansOut, lclSums);
2481  }
2482  }
2483
2484  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2485  // to the magnitude type, since operator/ (std::complex<T>, int)
2486  // isn't necessarily defined.
2488  ATS::one() / static_cast<mag_type>(this->getGlobalLength());
2489  for (size_t k = 0; k < numMeans; ++k) {
2491  }
2492 }
2493
// randomize(): fills the MultiVector with uniform random values over
// the full representable range of the scalar type -- [-max, max] for
// signed types, [0, max] otherwise -- by delegating to the two-argument
// overload below.
// NOTE(review): the qualified name line (original 2495) was swallowed
// by documentation markup.
2494 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2496  randomize() {
2497  typedef impl_scalar_type IST;
2498  typedef KokkosKernels::ArithTraits<IST> ATS;
2499  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2500  typedef typename pool_type::generator_type generator_type;
2501
2502  const IST max = Kokkos::rand<generator_type, IST>::max();
2503  const IST min = ATS::is_signed ? IST(-max) : ATS::zero();
2504
2505  this->randomize(static_cast<Scalar>(min), static_cast<Scalar>(max));
2506 }
2507
// randomize(minVal, maxVal): fills the MultiVector with uniform random
// values in [minVal, maxVal] using Tpetra's static XorShift64 pool
// (seeded once per process from the system RNG and the MPI rank).
// Fills the whole 2-D device view at once in the constant-stride case,
// or column by column via whichVectors_ otherwise.
2508 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2510  randomize(const Scalar& minVal, const Scalar& maxVal) {
2511  typedef impl_scalar_type IST;
2512  typedef Tpetra::Details::Static_Random_XorShift64_Pool<typename device_type::execution_space> tpetra_pool_type;
2513  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2514
2515  // Seed the pool based on the system RNG and the MPI rank, if needed
2516  if (!tpetra_pool_type::isSet())
2517  tpetra_pool_type::resetPool(this->getMap()->getComm()->getRank());
2518
2519  pool_type& rand_pool = tpetra_pool_type::getPool();
2520  const IST max = static_cast<IST>(maxVal);
2521  const IST min = static_cast<IST>(minVal);
2522
2523  auto thisView = this->getLocalViewDevice(Access::OverwriteAll);
2524
2525  if (isConstantStride()) {
2526  Kokkos::fill_random(thisView, rand_pool, min, max);
2527  } else {
2528  const size_t numVecs = getNumVectors();
2529  for (size_t k = 0; k < numVecs; ++k) {
2530  const size_t col = whichVectors_[k];
2531  auto X_k = Kokkos::subview(thisView, Kokkos::ALL(), col);
2532  Kokkos::fill_random(X_k, rand_pool, min, max);
2533  }
2534  }
2535 }
2536
// putScalar: sets every entry of the MultiVector to alpha.  Writes on
// whichever side (host or device) currently holds the most recent
// data, so no DualView sync is forced; handles both constant-stride
// and column-selected (whichVectors_) layouts via Details::Blas::fill.
2537 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2539  putScalar(const Scalar& alpha) {
2540  using ::Tpetra::Details::ProfilingRegion;
2541  using ::Tpetra::Details::Blas::fill;
2542  using DES = typename dual_view_type::t_dev::execution_space;
2543  using HES = typename dual_view_type::t_host::execution_space;
2544  using LO = LocalOrdinal;
2545  ProfilingRegion region("Tpetra::MultiVector::putScalar");
2546
2547  // We need this cast for cases like Scalar = std::complex<T> but
2548  // impl_scalar_type = Kokkos::complex<T>.
2549  const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
2550  const LO lclNumRows = static_cast<LO>(this->getLocalLength());
2551  const LO numVecs = static_cast<LO>(this->getNumVectors());
2552
2553  // Modify the most recently updated version of the data.  This
2554  // avoids sync'ing, which could violate users' expectations.
2555  //
2556  // If we need sync to device, then host has the most recent version.
2557  const bool runOnHost = runKernelOnHost(*this);
2558  if (!runOnHost) {
2559  auto X = this->getLocalViewDevice(Access::OverwriteAll);
2560  if (this->isConstantStride()) {
2561  fill(DES(), X, theAlpha, lclNumRows, numVecs);
2562  } else {
2563  fill(DES(), X, theAlpha, lclNumRows, numVecs,
2564  this->whichVectors_.getRawPtr());
2565  }
2566  } else {  // last modified in host memory, so modify data there.
2567  auto X = this->getLocalViewHost(Access::OverwriteAll);
2568  if (this->isConstantStride()) {
2569  fill(HES(), X, theAlpha, lclNumRows, numVecs);
2570  } else {
2571  fill(HES(), X, theAlpha, lclNumRows, numVecs,
2572  this->whichVectors_.getRawPtr());
2573  }
2574  }
2575 }
2576
// Replace this MultiVector's Map in place, without communication.
// Handles a process entering or leaving the Map's communicator by
// (re)allocating the local view when the local row count changes.
// Precondition: *this must be constant stride (checked below).
// NOTE(review): exception-macro opener lines and the qualified
// signature were elided by the documentation extraction; visible
// code is preserved byte-for-byte.
2577 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2579 replaceMap(const Teuchos::RCP<const map_type>& newMap) {
2580 using Teuchos::ArrayRCP;
2581 using Teuchos::Comm;
2582 using Teuchos::RCP;
2583 using ST = Scalar;
2584 using LO = LocalOrdinal;
2585 using GO = GlobalOrdinal;
2586
2587 // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2588 // it might work if the MV is a column view of another MV.
2589 // However, things might go wrong when restoring the original
2590 // Map, so we don't allow this case for now.
2592 !this->isConstantStride(), std::logic_error,
2593 "Tpetra::MultiVector::replaceMap: This method does not currently work "
2594 "if the MultiVector is a column view of another MultiVector (that is, if "
2595 "isConstantStride() == false).");
2596
2597 // Case 1: current Map and new Map are both nonnull on this process.
2598 // Case 2: current Map is nonnull, new Map is null.
2599 // Case 3: current Map is null, new Map is nonnull.
2600 // Case 4: both Maps are null: forbidden.
2601 //
2602 // Case 1 means that we don't have to do anything on this process,
2603 // other than assign the new Map. (We always have to do that.)
2604 // It's an error for the user to supply a Map that requires
2605 // resizing in this case.
2606 //
2607 // Case 2 means that the calling process is in the current Map's
2608 // communicator, but will be excluded from the new Map's
2609 // communicator. We don't have to do anything on the calling
2610 // process; just leave whatever data it may have alone.
2611 //
2612 // Case 3 means that the calling process is excluded from the
2613 // current Map's communicator, but will be included in the new
2614 // Map's communicator. This means we need to (re)allocate the
2615 // local DualView if it does not have the right number of rows.
2616 // If the new number of rows is nonzero, we'll fill the newly
2617 // allocated local data with zeros, as befits a projection
2618 // operation.
2619 //
2620 // The typical use case for Case 3 is that the MultiVector was
2621 // first created with the Map with more processes, then that Map
2622 // was replaced with a Map with fewer processes, and finally the
2623 // original Map was restored on this call to replaceMap.
2624
2625 if (this->getMap().is_null()) { // current Map is null
2626 // If this->getMap() is null, that means that this MultiVector
2627 // has already had replaceMap happen to it. In that case, just
2628 // reallocate the DualView with the right size.
2629
2631 newMap.is_null(), std::invalid_argument,
2632 "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2633 "This probably means that the input Map is incorrect.");
2634
2635 // Case 3: current Map is null, new Map is nonnull.
2636 // Reallocate the DualView with the right dimensions.
2637 const size_t newNumRows = newMap->getLocalNumElements();
2638 const size_t origNumRows = view_.extent(0);
2639 const size_t numCols = this->getNumVectors();
2640
// Only reallocate if the stored view's dimensions differ from what
// the new Map requires.  (The reallocation call itself is on a line
// elided by the extraction — confirm against the full source.)
2641 if (origNumRows != newNumRows || view_.extent(1) != numCols) {
2643 }
2644 } else if (newMap.is_null()) { // Case 2: current Map is nonnull, new Map is null
2645 // I am an excluded process. Reinitialize my data so that I
2646 // have 0 rows. Keep the number of columns as before.
2647 const size_t newNumRows = static_cast<size_t>(0);
2648 const size_t numCols = this->getNumVectors();
2650 }
2651
// Cases 1-3 all end by adopting the new Map (possibly null).
2652 this->map_ = newMap;
2653}
2654
// Scale every entry of this MultiVector by alpha, in place:
// this := alpha * this.  Runs on whichever memory space (host or
// device) holds the most recent data, to avoid a sync.
2655 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2657 scale(const Scalar& alpha) {
2658 using Kokkos::ALL;
2659 using IST = impl_scalar_type;
2660
2661 const IST theAlpha = static_cast<IST>(alpha);
// Scaling by one is the identity; skip the kernel launch entirely.
2662 if (theAlpha == KokkosKernels::ArithTraits<IST>::one()) {
2663 return; // do nothing
2664 }
2665 const size_t lclNumRows = getLocalLength();
2666 const size_t numVecs = getNumVectors();
2667 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2668 const std::pair<size_t, size_t> colRng(0, numVecs);
2669
2670 // We can't substitute putScalar(0.0) for scale(0.0), because the
2671 // former will overwrite NaNs present in the MultiVector. The
2672 // semantics of this call require multiplying them by 0, which
2673 // IEEE 754 requires to be NaN.
2674
2675 // If we need sync to device, then host has the most recent version.
2676 const bool useHostVersion = need_sync_device();
2677 if (useHostVersion) {
2678 auto Y_lcl = Kokkos::subview(getLocalViewHost(Access::ReadWrite), rowRng, ALL());
2679 if (isConstantStride()) {
2680 KokkosBlas::scal(Y_lcl, theAlpha, Y_lcl);
2681 } else {
// Non-constant stride: scale one selected column at a time.
2682 for (size_t k = 0; k < numVecs; ++k) {
2683 const size_t Y_col = whichVectors_[k];
2684 auto Y_k = Kokkos::subview(Y_lcl, ALL(), Y_col);
2685 KokkosBlas::scal(Y_k, theAlpha, Y_k);
2686 }
2687 }
2688 } else { // work on device
2689 auto Y_lcl = Kokkos::subview(getLocalViewDevice(Access::ReadWrite), rowRng, ALL());
2690 if (isConstantStride()) {
2691 KokkosBlas::scal(Y_lcl, theAlpha, Y_lcl);
2692 } else {
2693 for (size_t k = 0; k < numVecs; ++k) {
// NOTE(review): isConstantStride() is known false in this branch,
// so the ternary always selects whichVectors_[k]; the host branch
// above writes it without the ternary.  Behavior is identical.
2694 const size_t Y_col = isConstantStride() ? k : whichVectors_[k];
2695 auto Y_k = Kokkos::subview(Y_lcl, ALL(), Y_col);
2696 KokkosBlas::scal(Y_k, theAlpha, Y_k);
2697 }
2698 }
2699 }
2700}
2701
// Scale each column k of this MultiVector by its own coefficient
// alphas[k].  Requires alphas.size() == getNumVectors().  Copies the
// host-side coefficients into a device View, then delegates to the
// device-View overload of scale().
2702 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2704 scale(const Teuchos::ArrayView<const Scalar>& alphas) {
2705 const size_t numVecs = this->getNumVectors();
2706 const size_t numAlphas = static_cast<size_t>(alphas.size());
2708 numAlphas != numVecs, std::invalid_argument,
2709 "Tpetra::MultiVector::"
2710 "scale: alphas.size() = "
2711 << numAlphas << " != this->getNumVectors() = "
2712 << numVecs << ".");
2713
2714 // Use a DualView to copy the scaling constants onto the device.
2715 using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2716 k_alphas_type k_alphas("alphas::tmp", numAlphas);
2717 k_alphas.modify_host();
2718 for (size_t i = 0; i < numAlphas; ++i) {
2719 k_alphas.view_host()(i) = static_cast<impl_scalar_type>(alphas[i]);
2720 }
2721 k_alphas.sync_device();
2722 // Invoke the scale() overload that takes a device View of coefficients.
2723 this->scale(k_alphas.view_device());
2724}
2725
// Scale each column k of this MultiVector by alphas(k), where alphas
// is a device View with one entry per column.  Works in host memory
// (mirroring alphas) if host holds the most recent data, otherwise
// directly in device memory.
2726 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2728 scale(const Kokkos::View<const impl_scalar_type*, device_type>& alphas) {
2729 using Kokkos::ALL;
2730 using Kokkos::subview;
2731
2732 const size_t lclNumRows = this->getLocalLength();
2733 const size_t numVecs = this->getNumVectors();
2735 static_cast<size_t>(alphas.extent(0)) != numVecs,
2736 std::invalid_argument,
2737 "Tpetra::MultiVector::scale(alphas): "
2738 "alphas.extent(0) = "
2739 << alphas.extent(0)
2740 << " != this->getNumVectors () = " << numVecs << ".");
2741 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2742 const std::pair<size_t, size_t> colRng(0, numVecs);
2743
2744 // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2745 // type of the return value of subview. This is because if we
2746 // switch the array layout from LayoutLeft to LayoutRight
2747 // (preferred for performance of block operations), the types
2748 // below won't be valid. (A view of a column of a LayoutRight
2749 // multivector has LayoutStride, not LayoutLeft.)
2750
2751 // If we need sync to device, then host has the most recent version.
2752 const bool useHostVersion = this->need_sync_device();
2753 if (useHostVersion) {
2754 // Work in host memory. This means we need to create a host
2755 // mirror of the input View of coefficients.
2756 auto alphas_h = Kokkos::create_mirror_view(alphas);
2757 // DEEP_COPY REVIEW - NOT TESTED
2758 Kokkos::deep_copy(alphas_h, alphas);
2759
2760 auto Y_lcl = subview(this->getLocalViewHost(Access::ReadWrite), rowRng, ALL());
2761 if (isConstantStride()) {
2762 KokkosBlas::scal(Y_lcl, alphas_h, Y_lcl);
2763 } else {
2764 for (size_t k = 0; k < numVecs; ++k) {
2765 const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2766 auto Y_k = subview(Y_lcl, ALL(), Y_col);
2767 // We don't have to use the entire 1-D View here; we can use
2768 // the version that takes a scalar coefficient.
2769 KokkosBlas::scal(Y_k, alphas_h(k), Y_k);
2770 }
2771 }
2772 } else { // Work in device memory, using the input View 'alphas' directly.
2773 auto Y_lcl = subview(this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL());
2774 if (isConstantStride()) {
2775 KokkosBlas::scal(Y_lcl, alphas, Y_lcl);
2776 } else {
2777 // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2778 // as values on host, so copy them to host. Another approach
2779 // would be to fix scal() so that it takes a 0-D View as the
2780 // second argument.
2781 auto alphas_h = Kokkos::create_mirror_view(alphas);
2782 // DEEP_COPY REVIEW - NOT TESTED
2783 Kokkos::deep_copy(alphas_h, alphas);
2784
2785 for (size_t k = 0; k < numVecs; ++k) {
2786 const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2787 auto Y_k = subview(Y_lcl, ALL(), Y_col);
2788 KokkosBlas::scal(Y_k, alphas_h(k), Y_k);
2789 }
2790 }
2791 }
2792}
2793
// Elementwise scaled assignment: this := alpha * A.  *this and A
// must have the same local length and number of columns (checked
// below; the macro-opener lines were elided by the extraction).
// Always runs on device.
2794 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2796 scale(const Scalar& alpha,
2798 using Kokkos::ALL;
2799 using Kokkos::subview;
2800 const char tfecfFuncName[] = "scale: ";
2801
2802 const size_t lclNumRows = getLocalLength();
2803 const size_t numVecs = getNumVectors();
2804
2806 lclNumRows != A.getLocalLength(), std::invalid_argument,
2807 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2808 << A.getLocalLength() << ".");
2810 numVecs != A.getNumVectors(), std::invalid_argument,
2811 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2812 << A.getNumVectors() << ".");
2813
2814 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
2815 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2816 const std::pair<size_t, size_t> colRng(0, numVecs);
2817
2818 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2819 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2820 auto Y_lcl = subview(Y_lcl_orig, rowRng, ALL());
2821 auto X_lcl = subview(X_lcl_orig, rowRng, ALL());
2822
// Fast path: both operands are contiguous 2-D views.
2823 if (isConstantStride() && A.isConstantStride()) {
2824 KokkosBlas::scal(Y_lcl, theAlpha, X_lcl);
2825 } else {
2826 // Make sure that Kokkos only uses the local length for add.
2827 for (size_t k = 0; k < numVecs; ++k) {
2828 const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2829 const size_t X_col = A.isConstantStride() ? k : A.whichVectors_[k];
2830 auto Y_k = subview(Y_lcl, ALL(), Y_col);
2831 auto X_k = subview(X_lcl, ALL(), X_col);
2832
2833 KokkosBlas::scal(Y_k, theAlpha, X_k);
2834 }
2835 }
2836}
2837
// Elementwise reciprocal: this(i,j) := 1 / A(i,j).  Dimensions of
// *this and A must match (checked below; macro-opener lines and the
// qualified signature were elided by the extraction).  Runs on device.
2838 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2841 const char tfecfFuncName[] = "reciprocal: ";
2842
2844 getLocalLength() != A.getLocalLength(), std::runtime_error,
2845 "MultiVectors do not have the same local length. "
2846 "this->getLocalLength() = "
2847 << getLocalLength()
2848 << " != A.getLocalLength() = " << A.getLocalLength() << ".");
2850 A.getNumVectors() != this->getNumVectors(), std::runtime_error,
2851 ": MultiVectors do not have the same number of columns (vectors). "
2852 "this->getNumVectors() = "
2853 << getNumVectors()
2854 << " != A.getNumVectors() = " << A.getNumVectors() << ".");
2855
2856 const size_t numVecs = getNumVectors();
2857
2858 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2859 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2860
2861 if (isConstantStride() && A.isConstantStride()) {
2862 KokkosBlas::reciprocal(this_view_dev, A_view_dev);
2863 } else {
2864 using Kokkos::ALL;
2865 using Kokkos::subview;
2866 for (size_t k = 0; k < numVecs; ++k) {
2867 const size_t this_col = isConstantStride() ? k : whichVectors_[k];
// NOTE(review): 'vector_k' used below is defined on a line elided by
// the extraction — presumably subview(this_view_dev, ALL(), this_col);
// confirm against the full source.
// NOTE(review): A_col is guarded by this->isConstantStride(), not
// A.isConstantStride() — verify this is intended when *this is
// constant stride but A is not.
2869 const size_t A_col = isConstantStride() ? k : A.whichVectors_[k];
2870 auto vector_Ak = subview(A_view_dev, ALL(), A_col);
2871 KokkosBlas::reciprocal(vector_k, vector_Ak);
2872 }
2873 }
2874}
2875
// Elementwise absolute value: this(i,j) := |A(i,j)|.  Dimensions of
// *this and A must match (checked below; macro-opener lines and the
// qualified signature were elided by the extraction).  Runs on device.
2876 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2879 const char tfecfFuncName[] = "abs";
2880
2882 getLocalLength() != A.getLocalLength(), std::runtime_error,
2883 ": MultiVectors do not have the same local length. "
2884 "this->getLocalLength() = "
2885 << getLocalLength()
2886 << " != A.getLocalLength() = " << A.getLocalLength() << ".");
2888 A.getNumVectors() != this->getNumVectors(), std::runtime_error,
2889 ": MultiVectors do not have the same number of columns (vectors). "
2890 "this->getNumVectors() = "
2891 << getNumVectors()
2892 << " != A.getNumVectors() = " << A.getNumVectors() << ".");
2893 const size_t numVecs = getNumVectors();
2894
2895 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2896 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2897
2898 if (isConstantStride() && A.isConstantStride()) {
2899 KokkosBlas::abs(this_view_dev, A_view_dev);
2900 } else {
2901 using Kokkos::ALL;
2902 using Kokkos::subview;
2903
2904 for (size_t k = 0; k < numVecs; ++k) {
2905 const size_t this_col = isConstantStride() ? k : whichVectors_[k];
// NOTE(review): 'vector_k' used below is defined on a line elided by
// the extraction — presumably subview(this_view_dev, ALL(), this_col);
// confirm against the full source.
// NOTE(review): A_col is guarded by this->isConstantStride(), not
// A.isConstantStride() — same pattern as reciprocal(); verify intent.
2907 const size_t A_col = isConstantStride() ? k : A.whichVectors_[k];
2908 auto vector_Ak = subview(A_view_dev, ALL(), A_col);
2909 KokkosBlas::abs(vector_k, vector_Ak);
2910 }
2911 }
2912}
2913
// AXPBY update: this := beta * this + alpha * A.  Dimension checks
// run only in a debug-guarded block (the guard's opening line and
// macro openers were elided by the extraction).  Runs on device.
2914 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2916 update(const Scalar& alpha,
2918 const Scalar& beta) {
2919 const char tfecfFuncName[] = "update: ";
2920 using Kokkos::ALL;
2921 using Kokkos::subview;
2922
2923 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::update(alpha,A,beta)");
2924
2925 const size_t lclNumRows = getLocalLength();
2926 const size_t numVecs = getNumVectors();
2927
2930 lclNumRows != A.getLocalLength(), std::invalid_argument,
2931 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2932 << A.getLocalLength() << ".");
2934 numVecs != A.getNumVectors(), std::invalid_argument,
2935 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2936 << A.getNumVectors() << ".");
2937 }
2938
2939 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
2940 const impl_scalar_type theBeta = static_cast<impl_scalar_type>(beta);
2941 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2942 const std::pair<size_t, size_t> colRng(0, numVecs);
2943
2944 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2945 auto Y_lcl = subview(Y_lcl_orig, rowRng, Kokkos::ALL());
2946 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2947 auto X_lcl = subview(X_lcl_orig, rowRng, Kokkos::ALL());
2948
2949 // The device memory of *this is about to be modified
// Fast path: both operands contiguous, single axpby kernel.
2950 if (isConstantStride() && A.isConstantStride()) {
2951 KokkosBlas::axpby(theAlpha, X_lcl, theBeta, Y_lcl);
2952 } else {
2953 // Make sure that Kokkos only uses the local length for add.
2954 for (size_t k = 0; k < numVecs; ++k) {
2955 const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2956 const size_t X_col = A.isConstantStride() ? k : A.whichVectors_[k];
2957 auto Y_k = subview(Y_lcl, ALL(), Y_col);
2958 auto X_k = subview(X_lcl, ALL(), X_col);
2959
2960 KokkosBlas::axpby(theAlpha, X_k, theBeta, Y_k);
2961 }
2962 }
2963}
2964
// Three-term update: this := gamma * this + alpha * A + beta * B.
// All three MultiVectors must agree in local length and column
// count; those checks run in a debug-guarded block whose opening
// line (and the macro openers) were elided by the extraction.
2965 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2967 update(const Scalar& alpha,
2969 const Scalar& beta,
2971 const Scalar& gamma) {
2972 using Kokkos::ALL;
2973 using Kokkos::subview;
2974
2975 const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
2976
2977 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::update(alpha,A,beta,B,gamma)");
2978
2979 const size_t lclNumRows = this->getLocalLength();
2980 const size_t numVecs = getNumVectors();
2981
2984 lclNumRows != A.getLocalLength(), std::invalid_argument,
2985 "The input MultiVector A has " << A.getLocalLength() << " local "
2986 "row(s), but this MultiVector has "
2987 << lclNumRows << " local "
2988 "row(s).");
2990 lclNumRows != B.getLocalLength(), std::invalid_argument,
2991 "The input MultiVector B has " << B.getLocalLength() << " local "
2992 "row(s), but this MultiVector has "
2993 << lclNumRows << " local "
2994 "row(s).");
2996 A.getNumVectors() != numVecs, std::invalid_argument,
2997 "The input MultiVector A has " << A.getNumVectors() << " column(s), "
2998 "but this MultiVector has "
2999 << numVecs << " column(s).");
3001 B.getNumVectors() != numVecs, std::invalid_argument,
3002 "The input MultiVector B has " << B.getNumVectors() << " column(s), "
3003 "but this MultiVector has "
3004 << numVecs << " column(s).");
3005 }
3006
3007 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
3008 const impl_scalar_type theBeta = static_cast<impl_scalar_type>(beta);
3009 const impl_scalar_type theGamma = static_cast<impl_scalar_type>(gamma);
3010
3011 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
3012 const std::pair<size_t, size_t> colRng(0, numVecs);
3013
3014 // Prefer 'auto' over specifying the type explicitly. This avoids
3015 // issues with a subview possibly having a different type than the
3016 // original view.
3017 auto C_lcl = subview(this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL());
3018 auto A_lcl = subview(A.getLocalViewDevice(Access::ReadOnly), rowRng, ALL());
3019 auto B_lcl = subview(B.getLocalViewDevice(Access::ReadOnly), rowRng, ALL());
3020
3021 if (isConstantStride() && A.isConstantStride() && B.isConstantStride()) {
3022 KokkosBlas::update(theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3023 } else {
3024 // Some input (or *this) is not constant stride,
3025 // so perform the update one column at a time.
3026 for (size_t k = 0; k < numVecs; ++k) {
3027 const size_t this_col = isConstantStride() ? k : whichVectors_[k];
3028 const size_t A_col = A.isConstantStride() ? k : A.whichVectors_[k];
3029 const size_t B_col = B.isConstantStride() ? k : B.whichVectors_[k];
// The output-column argument of this call is on a line elided by the
// extraction (presumably a subview of C_lcl at this_col).
3030 KokkosBlas::update(theAlpha, subview(A_lcl, rowRng, A_col),
3031 theBeta, subview(B_lcl, rowRng, B_col),
3033 }
3034 }
3035}
3036
// Return a persisting, read-only host view of column j as an
// ArrayRCP<const Scalar>.  Syncs to host (via getLocalViewHost with
// ReadOnly access) before taking the view.
3037 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3038 Teuchos::ArrayRCP<const Scalar>
3040 getData(size_t j) const {
3041 using Kokkos::ALL;
3042 using IST = impl_scalar_type;
3043 const char tfecfFuncName[] = "getData: ";
3044
3045 // Any MultiVector method that called the (classic) Kokkos Node's
3046 // viewBuffer or viewBufferNonConst methods always implied a
3047 // device->host synchronization. Thus, we synchronize here as
3048 // well.
3049
3050 auto hostView = getLocalViewHost(Access::ReadOnly);
// Map the caller's column index through whichVectors_ when this MV
// is a non-constant-stride view.
3051 const size_t col = isConstantStride() ? j : whichVectors_[j];
3052 auto hostView_j = Kokkos::subview(hostView, ALL(), col);
3053 Teuchos::ArrayRCP<const IST> dataAsArcp =
3054 Kokkos::Compat::persistingView(hostView_j, 0, getLocalLength());
3055
// Internal sanity check: the ArrayRCP must not claim more entries
// than the underlying column view holds.
3056 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(hostView_j.extent(0)) <
3057 static_cast<size_t>(dataAsArcp.size()),
3058 std::logic_error,
3059 "hostView_j.extent(0) = " << hostView_j.extent(0)
3060 << " < dataAsArcp.size() = " << dataAsArcp.size() << ". "
3061 "Please report this bug to the Tpetra developers.");
3062
// impl_scalar_type and Scalar are layout-compatible; reinterpret.
3063 return Teuchos::arcp_reinterpret_cast<const Scalar>(dataAsArcp);
3064}
3065
// Return a persisting, writable host view of column j as an
// ArrayRCP<Scalar>.  Uses ReadWrite access, so the host copy is
// synced and marked modified.
3066 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3067 Teuchos::ArrayRCP<Scalar>
3069 getDataNonConst(size_t j) {
3070 using Kokkos::ALL;
3071 using Kokkos::subview;
3072 using IST = impl_scalar_type;
3073 const char tfecfFuncName[] = "getDataNonConst: ";
3074
3075 auto hostView = getLocalViewHost(Access::ReadWrite);
// Map the caller's column index through whichVectors_ when this MV
// is a non-constant-stride view.
3076 const size_t col = isConstantStride() ? j : whichVectors_[j];
3077 auto hostView_j = subview(hostView, ALL(), col);
3078 Teuchos::ArrayRCP<IST> dataAsArcp =
3079 Kokkos::Compat::persistingView(hostView_j, 0, getLocalLength());
3080
// Internal sanity check, mirroring getData().
3081 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(hostView_j.extent(0)) <
3082 static_cast<size_t>(dataAsArcp.size()),
3083 std::logic_error,
3084 "hostView_j.extent(0) = " << hostView_j.extent(0)
3085 << " < dataAsArcp.size() = " << dataAsArcp.size() << ". "
3086 "Please report this bug to the Tpetra developers.");
3087
3088 return Teuchos::arcp_reinterpret_cast<Scalar>(dataAsArcp);
3089}
3090
// Deep-copy the columns listed in 'cols' into a new MultiVector.
// Contiguous column indices are forwarded to the Range1D overload;
// otherwise the copy goes through a non-constant-stride subView.
3091 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3092 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3094 subCopy(const Teuchos::ArrayView<const size_t>& cols) const {
3095 using Teuchos::RCP;
3097
3098 // Check whether the index set in cols is contiguous. If it is,
3099 // use the more efficient Range1D version of subCopy.
3100 bool contiguous = true;
3101 const size_t numCopyVecs = static_cast<size_t>(cols.size());
3102 for (size_t j = 1; j < numCopyVecs; ++j) {
3103 if (cols[j] != cols[j - 1] + static_cast<size_t>(1)) {
3104 contiguous = false;
3105 break;
3106 }
3107 }
3108 if (contiguous && numCopyVecs > 0) {
3109 return this->subCopy(Teuchos::Range1D(cols[0], cols[numCopyVecs - 1]));
3110 } else {
// View the selected columns, then deep-copy them into a freshly
// allocated (uninitialized: third argument false) MultiVector.
3111 RCP<const MV> X_sub = this->subView(cols);
3112 RCP<MV> Y(new MV(this->getMap(), numCopyVecs, false));
3113 Y->assign(*X_sub);
3114 return Y;
3115 }
3116}
3117
// Deep-copy the contiguous column range colRng into a new
// MultiVector: view the range, allocate an uninitialized target,
// then assign.
3118 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3119 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3121 subCopy(const Teuchos::Range1D& colRng) const {
3122 using Teuchos::RCP;
3124
3125 RCP<const MV> X_sub = this->subView(colRng);
3126 RCP<MV> Y(new MV(this->getMap(), static_cast<size_t>(colRng.size()), false));
3127 Y->assign(*X_sub);
3128 return Y;
3129}
3130
// Number of rows of the *original* allocation backing view_, which
// may exceed getLocalLength() when this MV is an offset/row view.
3131 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3132 size_t
3134 getOrigNumLocalRows() const {
3135 return view_.origExtent(0);
3136}
3137
// Number of columns of the *original* allocation backing view_,
// which may exceed getNumVectors() when this MV is a column view.
3138 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3139 size_t
3141 getOrigNumLocalCols() const {
3142 return view_.origExtent(1);
3143}
3144
// Offset-view constructor: build *this as a shallow row-range view
// of X, starting at rowOffset, with local row count taken from
// subMap.  Validates dimensions (globally, in debug builds) and then
// shallow-copies the sub-view MultiVector into *this.
// NOTE(review): several hyperlinked lines (the qualified signature,
// all-reduce calls, and stream-gather lines) were elided by the
// documentation extraction; visible code is preserved byte-for-byte.
3145 template <class Scalar, class LO, class GO, class Node>
3148 const Teuchos::RCP<const map_type>& subMap,
3149 const local_ordinal_type rowOffset)
3150 : base_type(subMap) {
3151 using Kokkos::ALL;
3152 using Kokkos::subview;
3153 using std::endl;
3154 using Teuchos::outArg;
3155 using Teuchos::RCP;
3156 using Teuchos::rcp;
3157 using Teuchos::REDUCE_MIN;
3158 using Teuchos::reduceAll;
3160 const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3161 const char suffix[] = "Please report this bug to the Tpetra developers.";
3162 int lclGood = 1;
3163 int gblGood = 1;
3164 std::unique_ptr<std::ostringstream> errStrm;
3165 const bool debug = ::Tpetra::Details::Behavior::debug();
3166 const bool verbose = ::Tpetra::Details::Behavior::verbose();
3167
3168 // Be careful to use the input Map's communicator, not X's. The
3169 // idea is that, on return, *this is a subview of X, using the
3170 // input Map.
3171 const auto comm = subMap->getComm();
3172 TEUCHOS_ASSERT(!comm.is_null());
3173 const int myRank = comm->getRank();
3174
3175 const LO lclNumRowsBefore = static_cast<LO>(X.getLocalLength());
3176 const LO numCols = static_cast<LO>(X.getNumVectors());
3177 const LO newNumRows = static_cast<LO>(subMap->getLocalNumElements());
3178 if (verbose) {
3179 std::ostringstream os;
3180 os << "Proc " << myRank << ": " << prefix
3181 << "X: {lclNumRows: " << lclNumRowsBefore
3182 << ", origLclNumRows: " << X.getOrigNumLocalRows()
3183 << ", numCols: " << numCols << "}, "
3184 << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3185 std::cerr << os.str();
3186 }
3187 // We ask for the _original_ number of rows in X, because X could
3188 // be a shorter (fewer rows) view of a longer MV. (For example, X
3189 // could be a domain Map view of a column Map MV.)
3190 const bool tooManyElts =
3191 newNumRows + rowOffset > static_cast<LO>(X.getOrigNumLocalRows());
3192 if (tooManyElts) {
3193 errStrm = std::unique_ptr<std::ostringstream>(new std::ostringstream);
3194 *errStrm << " Proc " << myRank << ": subMap->getLocalNumElements() (="
3195 << newNumRows << ") + rowOffset (=" << rowOffset
3196 << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows()
3197 << ")." << endl;
3198 lclGood = 0;
// In non-debug builds, throw locally right away; in debug builds we
// defer so every process can participate in the all-reduce below.
3199 TEUCHOS_TEST_FOR_EXCEPTION(!debug && tooManyElts, std::invalid_argument,
3200 prefix << errStrm->str() << suffix);
3201 }
3202
3203 if (debug) {
// (The all-reduce computing gblGood from lclGood is on a line elided
// by the extraction.)
3205 if (gblGood != 1) {
3206 std::ostringstream gblErrStrm;
3207 const std::string myErrStr =
3208 errStrm.get() != nullptr ? errStrm->str() : std::string("");
3210 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, gblErrStrm.str());
3211 }
3212 }
3213
3214 using range_type = std::pair<LO, LO>;
3215 const range_type origRowRng(rowOffset, static_cast<LO>(X.view_.origExtent(0)));
3216 const range_type rowRng(rowOffset, rowOffset + newNumRows);
3217
3218 wrapped_dual_view_type newView = takeSubview(X.view_, rowRng, ALL());
3219
3220 // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3221 // handling subviews of degenerate Views quite so well. For some
3222 // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3223 // 0. We work around by creating a new empty DualView of the
3224 // desired (degenerate) dimensions.
3225 if (newView.extent(0) == 0 &&
3226 newView.extent(1) != X.view_.extent(1)) {
3227 newView =
3228 allocDualView<Scalar, LO, GO, Node>(0, X.getNumVectors());
3229 }
3230
// Preserve X's column selection: pass whichVectors_ through when X
// is not constant stride.
3231 MV subViewMV = X.isConstantStride() ? MV(subMap, newView) : MV(subMap, newView, X.whichVectors_());
3232
3233 if (debug) {
3234 const LO lclNumRowsRet = static_cast<LO>(subViewMV.getLocalLength());
3235 const LO numColsRet = static_cast<LO>(subViewMV.getNumVectors());
3236 if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3237 lclGood = 0;
3238 if (errStrm.get() == nullptr) {
3239 errStrm = std::unique_ptr<std::ostringstream>(new std::ostringstream);
3240 }
3241 *errStrm << " Proc " << myRank << ": subMap.getLocalNumElements(): " << newNumRows << ", subViewMV.getLocalLength(): " << lclNumRowsRet << ", X.getNumVectors(): " << numCols << ", subViewMV.getNumVectors(): " << numColsRet << endl;
3242 }
// (All-reduce of lclGood into gblGood elided by the extraction.)
3244 if (gblGood != 1) {
3245 std::ostringstream gblErrStrm;
3246 if (myRank == 0) {
3247 gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3248 "dimensions on one or more processes:"
3249 << endl;
3250 }
3251 const std::string myErrStr =
3252 errStrm.get() != nullptr ? errStrm->str() : std::string("");
3254 gblErrStrm << suffix << endl;
3255 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, gblErrStrm.str());
3256 }
3257 }
3258
3259 if (verbose) {
3260 std::ostringstream os;
3261 os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3262 std::cerr << os.str();
3263 }
3264
3265 *this = subViewMV; // shallow copy
3266
3267 if (verbose) {
3268 std::ostringstream os;
3269 os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3270 std::cerr << os.str();
3271 }
3272}
3273
// Convenience overload: wrap the by-value subMap in an RCP and
// delegate to the main offset-view constructor above.  (The tail of
// the delegation — presumably the rowOffset argument and an empty
// body — is on a line elided by the documentation extraction.)
3274 template <class Scalar, class LO, class GO, class Node>
3277 const map_type& subMap,
3278 const size_t rowOffset)
3279 : MultiVector(X, Teuchos::RCP<const map_type>(new map_type(subMap)),
3281
// Return a const row-offset view of *this: same columns, rows
// starting at 'offset', sized and distributed by subMap.
3282 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3283 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3285 offsetView(const Teuchos::RCP<const map_type>& subMap,
3286 const size_t offset) const {
3288 return Teuchos::rcp(new MV(*this, *subMap, offset));
3289}
3290
// Nonconst variant of offsetView: same row-offset view, but the
// returned MultiVector may be modified (writes alias *this).
3291 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3292 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3294 offsetViewNonConst(const Teuchos::RCP<const map_type>& subMap,
3295 const size_t offset) {
3297 return Teuchos::rcp(new MV(*this, *subMap, offset));
3298}
3299
// Return a const shallow view of the columns listed in 'cols'.
// Contiguous index sets are forwarded to the Range1D overload;
// otherwise a non-constant-stride view is built (composing through
// whichVectors_ if *this is itself such a view).
3300 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3301 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3303 subView(const Teuchos::ArrayView<const size_t>& cols) const {
3304 using Teuchos::Array;
3305 using Teuchos::rcp;
3307
3308 const size_t numViewCols = static_cast<size_t>(cols.size());
3310 numViewCols < 1, std::runtime_error,
3311 "Tpetra::MultiVector::subView"
3312 "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3313 "contain at least one entry, but cols.size() = "
3314 << cols.size()
3315 << " == 0.");
3316
3317 // Check whether the index set in cols is contiguous. If it is,
3318 // use the more efficient Range1D version of subView.
3319 bool contiguous = true;
3320 for (size_t j = 1; j < numViewCols; ++j) {
3321 if (cols[j] != cols[j - 1] + static_cast<size_t>(1)) {
3322 contiguous = false;
3323 break;
3324 }
3325 }
3326 if (contiguous) {
// NOTE(review): given the numViewCols < 1 check above throws, this
// numViewCols == 0 branch appears unreachable; verify against the
// full source (the exception-macro opener is elided here).
3327 if (numViewCols == 0) {
3328 // The output MV has no columns, so there is nothing to view.
3329 return rcp(new MV(this->getMap(), numViewCols));
3330 } else {
3331 // Use the more efficient contiguous-index-range version.
3332 return this->subView(Teuchos::Range1D(cols[0], cols[numViewCols - 1]));
3333 }
3334 }
3335
3336 if (isConstantStride()) {
3337 return rcp(new MV(this->getMap(), view_, cols));
3338 } else {
// *this is already a column view: translate the requested indices
// through whichVectors_ so they index the original allocation.
3339 Array<size_t> newcols(cols.size());
3340 for (size_t j = 0; j < numViewCols; ++j) {
3341 newcols[j] = whichVectors_[cols[j]];
3342 }
3343 return rcp(new MV(this->getMap(), view_, newcols()));
3344 }
3345}
3346
// Return a const shallow view of the contiguous column range colRng.
// Handles empty ranges, constant-stride and non-constant-stride
// sources, and (in debug builds) globally verifies the result's
// dimensions.  NOTE(review): exception-macro opener lines, one
// subview line, and the debug all-reduce lines were elided by the
// documentation extraction; visible code is preserved byte-for-byte.
3347 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3348 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3350 subView(const Teuchos::Range1D& colRng) const {
3351 using Kokkos::ALL;
3352 using Kokkos::subview;
3353 using Teuchos::Array;
3354 using Teuchos::RCP;
3355 using Teuchos::rcp;
3356 using ::Tpetra::Details::Behavior;
3358 const char tfecfFuncName[] = "subView(Range1D): ";
3359
3360 const size_t lclNumRows = this->getLocalLength();
3361 const size_t numVecs = this->getNumVectors();
3362 // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3363 // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3364 // "at least one vector.");
3366 static_cast<size_t>(colRng.size()) > numVecs, std::runtime_error,
3367 "colRng.size() = " << colRng.size() << " > this->getNumVectors() = "
3368 << numVecs << ".");
3370 numVecs != 0 && colRng.size() != 0 &&
3371 (colRng.lbound() < static_cast<Teuchos::Ordinal>(0) ||
3372 static_cast<size_t>(colRng.ubound()) >= numVecs),
3373 std::invalid_argument, "Nonempty input range [" << colRng.lbound() << "," << colRng.ubound() << "] exceeds the valid range of column indices "
3374 "[0, "
3375 << numVecs << "].");
3376
3377 RCP<const MV> X_ret; // the MultiVector subview to return
3378
3379 // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3380 // broken for the case of views with zero rows. I will brutally
3381 // enforce that the subview has the correct dimensions. In
3382 // particular, in the case of zero rows, I will, if necessary,
3383 // create a new dual_view_type with zero rows and the correct
3384 // number of columns. In a debug build, I will use an all-reduce
3385 // to ensure that it has the correct dimensions on all processes.
3386
3387 const std::pair<size_t, size_t> rows(0, lclNumRows);
3388 if (colRng.size() == 0) {
3389 const std::pair<size_t, size_t> cols(0, 0); // empty range
3390 wrapped_dual_view_type X_sub = takeSubview(this->view_, ALL(), cols);
3391 X_ret = rcp(new MV(this->getMap(), X_sub));
3392 } else {
3393 // Returned MultiVector is constant stride only if *this is.
3394 if (isConstantStride()) {
3395 const std::pair<size_t, size_t> cols(colRng.lbound(),
3396 colRng.ubound() + 1);
3397 wrapped_dual_view_type X_sub = takeSubview(this->view_, ALL(), cols);
3398 X_ret = rcp(new MV(this->getMap(), X_sub));
3399 } else {
3400 if (static_cast<size_t>(colRng.size()) == static_cast<size_t>(1)) {
3401 // We're only asking for one column, so the result does have
3402 // constant stride, even though this MultiVector does not.
3403 const std::pair<size_t, size_t> col(whichVectors_[0] + colRng.lbound(),
3404 whichVectors_[0] + colRng.ubound() + 1);
// (The takeSubview call defining X_sub for this branch is on a line
// elided by the extraction.)
3406 X_ret = rcp(new MV(this->getMap(), X_sub));
3407 } else {
// Multiple non-contiguous columns: slice whichVectors_ itself.
3408 Array<size_t> which(whichVectors_.begin() + colRng.lbound(),
3409 whichVectors_.begin() + colRng.ubound() + 1);
3410 X_ret = rcp(new MV(this->getMap(), view_, which));
3411 }
3412 }
3413 }
3414
3415 const bool debug = Behavior::debug();
3416 if (debug) {
3417 using Teuchos::Comm;
3418 using Teuchos::outArg;
3419 using Teuchos::REDUCE_MIN;
3420 using Teuchos::reduceAll;
3421
3422 RCP<const Comm<int> > comm = this->getMap().is_null() ? Teuchos::null : this->getMap()->getComm();
3423 if (!comm.is_null()) {
3424 int lclSuccess = 1;
3425 int gblSuccess = 1;
3426
3427 if (X_ret.is_null()) {
3428 lclSuccess = 0;
3429 }
3432 "X_ret (the subview of this "
3433 "MultiVector; the return value of this method) is null on some MPI "
3434 "process in this MultiVector's communicator. This should never "
3435 "happen. Please report this bug to the Tpetra developers.");
3436 if (!X_ret.is_null() &&
3437 X_ret->getNumVectors() != static_cast<size_t>(colRng.size())) {
3438 lclSuccess = 0;
3439 }
3443 "X_ret->getNumVectors() != "
3444 "colRng.size(), on at least one MPI process in this MultiVector's "
3445 "communicator. This should never happen. "
3446 "Please report this bug to the Tpetra developers.");
3447 }
3448 }
3449 return X_ret;
3450}
3451
// Return a nonconst view of the columns of *this selected by 'cols'.
// Implemented by const-casting the result of the const subView overload;
// this is safe here because *this is nonconst.
// NOTE(review): this doxygen listing omits original lines 3454 and 3456
// (the class qualifier and, presumably, a typedef for MV).
3452 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3453 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3455  subViewNonConst(const Teuchos::ArrayView<const size_t>& cols) {
3457  return Teuchos::rcp_const_cast<MV>(this->subView(cols));
3458 }
3459
// Return a nonconst view of the contiguous column range 'colRng' of *this.
// Same const-cast pattern as the ArrayView overload above: forwards to the
// const subView and casts away constness of the result.
// NOTE(review): this doxygen listing omits original lines 3462 and 3464.
3460 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3461 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3463  subViewNonConst(const Teuchos::Range1D& colRng) {
3465  return Teuchos::rcp_const_cast<MV>(this->subView(colRng));
3466 }
3467
// Single-column view constructor: make *this a view of column j of X.
// Shares X's data (takeSubview), and also takes one-column subviews of X's
// communication buffers so Import/Export on the result needs no resizing.
// NOTE(review): this listing omits original lines 3469-3470, which carry
// the constructor's class qualifier and its first parameter (X).
3468 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3471  const size_t j)
3472  : base_type(X.getMap()) {
3473  using Kokkos::subview;
3474  typedef std::pair<size_t, size_t> range_type;
3475  const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3476
3477  const size_t numCols = X.getNumVectors();
  // Validate j before touching any views.
3478  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(j >= numCols, std::invalid_argument, "Input index j (== " << j << ") exceeds valid column index range [0, " << numCols << " - 1].");
  // jj is the physical column: for non-constant-stride X, map j through
  // whichVectors_ to the underlying view's column index.
3479  const size_t jj = X.isConstantStride() ? static_cast<size_t>(j) : static_cast<size_t>(X.whichVectors_[j]);
3480  this->view_ = takeSubview(X.view_, Kokkos::ALL(), range_type(jj, jj + 1));
3481
3482  // mfh 31 Jul 2017: It would be unwise to execute concurrent
3483  // Export or Import operations with different subviews of a
3484  // MultiVector. Thus, it is safe to reuse communication buffers.
3485  // See #1560 discussion.
3486  //
3487  // We only need one column's worth of buffer for imports_ and
3488  // exports_. Taking subviews now ensures that their lengths will
3489  // be exactly what we need, so we won't have to resize them later.
3490  {
3491  const size_t newSize = X.imports_.extent(0) / numCols;
3492  const size_t offset = jj * newSize;
3493  auto device_view = subview(X.imports_.view_device(),
3494  range_type(offset, offset + newSize));
3495  auto host_view = subview(X.imports_.view_host(),
3496  range_type(offset, offset + newSize));
3497  this->imports_ = decltype(X.imports_)(device_view, host_view);
3498  }
3499  {
3500  const size_t newSize = X.exports_.extent(0) / numCols;
3501  const size_t offset = jj * newSize;
3502  auto device_view = subview(X.exports_.view_device(),
3503  range_type(offset, offset + newSize));
3504  auto host_view = subview(X.exports_.view_host(),
3505  range_type(offset, offset + newSize));
3506  this->exports_ = decltype(X.exports_)(device_view, host_view);
3507  }
3508  // These two DualViews already either have the right number of
3509  // entries, or zero entries. This means that we don't need to
3510  // resize them.
3511  this->numImportPacketsPerLID_ = X.numImportPacketsPerLID_;
3512  this->numExportPacketsPerLID_ = X.numExportPacketsPerLID_;
3513 }
3514
// Return column j of *this as a const Vector that views (not copies) the
// data, via the single-column view constructor above.
// NOTE(review): this listing omits original lines 3517 and 3519
// (class qualifier and, presumably, a typedef for V).
3515 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3516 Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3518  getVector(const size_t j) const {
3520  return Teuchos::rcp(new V(*this, j));
3521 }
3522
// Return column j of *this as a nonconst Vector view; same construction
// as getVector, but without the const qualifier on the result.
// NOTE(review): this listing omits original lines 3525 and 3527.
3523 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3524 Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3526  getVectorNonConst(const size_t j) {
3528  return Teuchos::rcp(new V(*this, j));
3529 }
3530
// Copy all local entries of *this into the user array A, interpreted as a
// column-major (LayoutLeft) matrix with leading dimension LDA.
// Validates LDA and A.size(), then deep-copies either the whole view
// (constant stride) or column by column (non-constant stride), choosing
// host or device as the source based on which side is currently in use.
// NOTE(review): this listing omits original lines 3532 (class qualifier),
// 3543 (start of the LDA-check macro), and 3557-3568 (comments).
3531 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3533  get1dCopy(const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const {
3534  using IST = impl_scalar_type;
3535  using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3536  Kokkos::HostSpace,
3537  Kokkos::MemoryUnmanaged>;
3538  const char tfecfFuncName[] = "get1dCopy: ";
3539
3540  const size_t numRows = this->getLocalLength();
3541  const size_t numCols = this->getNumVectors();
3542
3544  "LDA = " << LDA << " < numRows = " << numRows << ".");
  // A must hold numCols columns of stride LDA, with numRows entries each.
3545  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numRows > size_t(0) && numCols > size_t(0) &&
3546  size_t(A.size()) < LDA * (numCols - 1) + numRows,
3547  std::runtime_error,
3548  "A.size() = " << A.size() << ", but its size must be at least "
3549  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3550
3551  const std::pair<size_t, size_t> rowRange(0, numRows);
3552  const std::pair<size_t, size_t> colRange(0, numCols);
3553
  // Wrap the raw output buffer as an unmanaged host view, then trim the
  // LDA x numCols wrapper down to the numRows x numCols destination.
3554  input_view_type A_view_orig(reinterpret_cast<IST*>(A.getRawPtr()),
3555  LDA, numCols);
3556  auto A_view = Kokkos::subview(A_view_orig, rowRange, colRange);
3557
  // If BOTH host and device need a sync, some external nonconst view is
  // alive; we cannot safely read either side, so refuse.
3569  if (this->need_sync_host() && this->need_sync_device()) {
3572  throw std::runtime_error("Tpetra::MultiVector: A non-const view is alive outside and we cannot give a copy where host or device view can be modified outside");
3573  } else {
  // Prefer whichever side (host/device) currently has more users.
3574  const bool useHostView = view_.host_view_use_count() >= view_.device_view_use_count();
3575  if (this->isConstantStride()) {
3576  if (useHostView) {
3577  auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3578  // DEEP_COPY REVIEW - NOT TESTED
3579  Kokkos::deep_copy(A_view, srcView_host);
3580  } else {
3581  auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3582  // DEEP_COPY REVIEW - NOT TESTED
3583  Kokkos::deep_copy(A_view, srcView_device);
3584  }
3585  } else {
  // Non-constant stride: copy one column at a time, mapping the
  // logical column j to the physical column via whichVectors_.
3586  for (size_t j = 0; j < numCols; ++j) {
3587  const size_t srcCol = this->whichVectors_[j];
3588  auto dstColView = Kokkos::subview(A_view, rowRange, j);
3589
3590  if (useHostView) {
3591  auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3592  auto srcColView_host = Kokkos::subview(srcView_host, rowRange, srcCol);
3593  // DEEP_COPY REVIEW - NOT TESTED
3594  Kokkos::deep_copy(dstColView, srcColView_host);
3595  } else {
3596  auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3597  auto srcColView_device = Kokkos::subview(srcView_device, rowRange, srcCol);
3598  // DEEP_COPY REVIEW - NOT TESTED
3599  Kokkos::deep_copy(dstColView, srcColView_device);
3600  }
3601  }
3602  }
3603  }
3604 }
3605
// Copy the local entries of *this into ArrayOfPtrs, one array per column.
// First validates that the outer array has exactly numCols entries and
// that every inner array is long enough, then delegates each column to
// Vector::get1dCopy via getVector(j).
// NOTE(review): this listing omits original lines 3607 (class qualifier),
// 3609, 3614, and 3625 (starts of the two check macros).
3606 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3608  get2dCopy(const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const {
3610  const char tfecfFuncName[] = "get2dCopy: ";
3611  const size_t numRows = this->getLocalLength();
3612  const size_t numCols = this->getNumVectors();
3613
3615  static_cast<size_t>(ArrayOfPtrs.size()) != numCols,
3616  std::runtime_error,
3617  "Input array of pointers must contain as many "
3618  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3619  << ArrayOfPtrs.size() << " != getNumVectors() = " << numCols << ".");
3620
3621  if (numRows != 0 && numCols != 0) {
3622  // No side effects until we've validated the input.
3623  for (size_t j = 0; j < numCols; ++j) {
3624  const size_t dstLen = static_cast<size_t>(ArrayOfPtrs[j].size());
3626  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3627  "the input array of arrays is not long enough to fit all entries in "
3628  "that column of the MultiVector. ArrayOfPtrs[j].size() = "
3629  << dstLen << " < getLocalLength() = " << numRows << ".");
3630  }
3631
3632  // We've validated the input, so it's safe to start copying.
3633  for (size_t j = 0; j < numCols; ++j) {
3634  Teuchos::RCP<const V> X_j = this->getVector(j);
  // Use the destination array's own length as the leading dimension.
3635  const size_t LDA = static_cast<size_t>(ArrayOfPtrs[j].size());
3636  X_j->get1dCopy(ArrayOfPtrs[j], LDA);
3637  }
3638  }
3639 }
3640
// Return a persisting const view of the host data as a single flat array.
// Requires constant stride (otherwise the data are not one contiguous
// array); returns null for a locally empty object.
// NOTE(review): this listing omits original lines 3643 (class qualifier)
// and 3648 (start of the constant-stride check macro).
3641 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3642 Teuchos::ArrayRCP<const Scalar>
3644  get1dView() const {
3645  if (getLocalLength() == 0 || getNumVectors() == 0) {
3646  return Teuchos::null;
3647  } else {
3649  !isConstantStride(), std::runtime_error,
3650  "Tpetra::MultiVector::"
3651  "get1dView: This MultiVector does not have constant stride, so it is "
3652  "not possible to view its data as a single array. You may check "
3653  "whether a MultiVector has constant stride by calling "
3654  "isConstantStride().");
3655  // Since get1dView() is and was always marked const, I have to
3656  // cast away const here in order not to break backwards
3657  // compatibility.
3658  auto X_lcl = getLocalViewHost(Access::ReadOnly);
  // persistingView keeps the Kokkos view alive as long as the ArrayRCP.
3659  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3660  Kokkos::Compat::persistingView(X_lcl);
3661  return Teuchos::arcp_reinterpret_cast<const Scalar>(dataAsArcp);
3662  }
3663 }
3664
// Nonconst variant of get1dView: same constant-stride requirement, but
// obtains the host view with ReadWrite access so callers may modify it.
// NOTE(review): this listing omits original lines 3667-3668, which carry
// the class qualifier and the function name.
3665 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3666 Teuchos::ArrayRCP<Scalar>
3669  if (this->getLocalLength() == 0 || this->getNumVectors() == 0) {
3670  return Teuchos::null;
3671  } else {
3672  TEUCHOS_TEST_FOR_EXCEPTION(!isConstantStride(), std::runtime_error,
3673  "Tpetra::MultiVector::"
3674  "get1dViewNonConst: This MultiVector does not have constant stride, "
3675  "so it is not possible to view its data as a single array. You may "
3676  "check whether a MultiVector has constant stride by calling "
3677  "isConstantStride().");
3678  auto X_lcl = getLocalViewHost(Access::ReadWrite);
3679  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3680  Kokkos::Compat::persistingView(X_lcl);
3681  return Teuchos::arcp_reinterpret_cast<Scalar>(dataAsArcp);
3682  }
3683 }
3684
// Return persisting nonconst views of the host data, one ArrayRCP per
// column. Works for both constant- and non-constant-stride objects by
// mapping logical column j through whichVectors_ when necessary.
// NOTE(review): this listing omits original lines 3687-3688 (class
// qualifier and function name).
3685 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3686 Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3689  auto X_lcl = getLocalViewHost(Access::ReadWrite);
3690
3691  // Don't use the row range here on the outside, in order to avoid
3692  // a strided return type (in case Kokkos::subview is conservative
3693  // about that). Instead, use the row range for the column views
3694  // in the loop.
3695  const size_t myNumRows = this->getLocalLength();
3696  const size_t numCols = this->getNumVectors();
3697  const Kokkos::pair<size_t, size_t> rowRange(0, myNumRows);
3698
3699  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views(numCols);
3700  for (size_t j = 0; j < numCols; ++j) {
3701  const size_t col = this->isConstantStride() ? j : this->whichVectors_[j];
3702  auto X_lcl_j = Kokkos::subview(X_lcl, rowRange, col);
3703  Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3704  Kokkos::Compat::persistingView(X_lcl_j);
3705  views[j] = Teuchos::arcp_reinterpret_cast<Scalar>(X_lcl_j_arcp);
3706  }
3707  return views;
3708 }
3709
// Read-only access to the local host view; delegates to the wrapped
// dual view, which handles any needed host/device synchronization.
// NOTE(review): this listing omits original lines 3711-3712
// (return type and class qualifier).
3710 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3713  getLocalViewHost(Access::ReadOnlyStruct s) const {
3714  return view_.getHostView(s);
3715 }
3716
// Read-write access to the local host view; the access tag tells the
// wrapped dual view to mark the host side as modified.
// NOTE(review): this listing omits original lines 3718-3719.
3717 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3720  getLocalViewHost(Access::ReadWriteStruct s) {
3721  return view_.getHostView(s);
3722 }
3723
// Overwrite-all access to the local host view: caller promises to write
// every entry, so no sync of existing data is required.
// NOTE(review): this listing omits original lines 3725-3726.
3724 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3727  getLocalViewHost(Access::OverwriteAllStruct s) {
3728  return view_.getHostView(s);
3729 }
3730
// Read-only access to the local device view; delegates to the wrapped
// dual view.
// NOTE(review): this listing omits original lines 3732-3733.
3731 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3734  getLocalViewDevice(Access::ReadOnlyStruct s) const {
3735  return view_.getDeviceView(s);
3736 }
3737
// Read-write access to the local device view; marks the device side as
// modified via the access tag.
// NOTE(review): this listing omits original lines 3739-3740.
3738 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3741  getLocalViewDevice(Access::ReadWriteStruct s) {
3742  return view_.getDeviceView(s);
3743 }
3744
// Overwrite-all access to the local device view: no sync of existing
// data is needed since every entry will be written.
// NOTE(review): this listing omits original lines 3746-3747.
3745 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3748  getLocalViewDevice(Access::OverwriteAllStruct s) {
3749  return view_.getDeviceView(s);
3750 }
3751
3752template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3758
// Return persisting const views of the host data, one ArrayRCP per
// column; const counterpart of get2dViewNonConst (ReadOnly access).
// NOTE(review): this listing omits original line 3761 (class qualifier).
3759 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3760 Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3762  get2dView() const {
3763  // Since get2dView() is and was always marked const, I have to
3764  // cast away const here in order not to break backwards
3765  // compatibility.
3766  auto X_lcl = getLocalViewHost(Access::ReadOnly);
3767
3768  // Don't use the row range here on the outside, in order to avoid
3769  // a strided return type (in case Kokkos::subview is conservative
3770  // about that). Instead, use the row range for the column views
3771  // in the loop.
3772  const size_t myNumRows = this->getLocalLength();
3773  const size_t numCols = this->getNumVectors();
3774  const Kokkos::pair<size_t, size_t> rowRange(0, myNumRows);
3775
3776  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views(numCols);
3777  for (size_t j = 0; j < numCols; ++j) {
3778  const size_t col = this->isConstantStride() ? j : this->whichVectors_[j];
3779  auto X_lcl_j = Kokkos::subview(X_lcl, rowRange, col);
3780  Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3781  Kokkos::Compat::persistingView(X_lcl_j);
3782  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar>(X_lcl_j_arcp);
3783  }
3784  return views;
3785 }
3786
// Dense matrix-matrix multiply: *this = beta * (*this) + alpha * op(A) * op(B),
// where op() applies transA/transB. Supports exactly three distribution
// cases (see the table in the body); ensures constant-stride operands by
// making deep copies where needed, then calls KokkosBlas::gemm on device
// views. In the all-reduce case (Case 2) the local results are summed
// across processes at the end via reduce().
// NOTE(review): this listing omits several original lines, including 3788
// (class qualifier), 3792-3793 (the A and B parameters), 3806, 3889 (the
// reduceAll call), 3892, 3953/3960 (A_tmp/B_tmp declarations), and 3996
// (conversion of alpha to alpha_IST).
3787 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3789  multiply(Teuchos::ETransp transA,
3790  Teuchos::ETransp transB,
3791  const Scalar& alpha,
3794  const Scalar& beta) {
3795  using std::endl;
3796  using Teuchos::CONJ_TRANS;
3797  using Teuchos::NO_TRANS;
3798  using Teuchos::RCP;
3799  using Teuchos::rcp;
3800  using Teuchos::rcpFromRef;
3801  using Teuchos::TRANS;
3802  using ::Tpetra::Details::ProfilingRegion;
3803  using ATS = KokkosKernels::ArithTraits<impl_scalar_type>;
3804  using LO = local_ordinal_type;
3805  using STS = Teuchos::ScalarTraits<Scalar>;
3807  const char tfecfFuncName[] = "multiply: ";
3808  ProfilingRegion region("Tpetra::MV::multiply");
3809
3810  // This routine performs a variety of matrix-matrix multiply
3811  // operations, interpreting the MultiVector (this-aka C , A and B)
3812  // as 2D matrices. Variations are due to the fact that A, B and C
3813  // can be locally replicated or globally distributed MultiVectors
3814  // and that we may or may not operate with the transpose of A and
3815  // B. Possible cases are:
3816  //
3817  // Operations # Cases Notes
3818  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
3819  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
3820  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
3821  //
3822  // The following operations are not meaningful for 1-D
3823  // distributions:
3824  //
3825  // u1) C(local) = A^T(distr) * B^T(distr) 1
3826  // u2) C(local) = A (distr) * B^X(distr) 2
3827  // u3) C(distr) = A^X(local) * B^X(local) 4
3828  // u4) C(distr) = A^X(local) * B^X(distr) 4
3829  // u5) C(distr) = A^T(distr) * B^X(local) 2
3830  // u6) C(local) = A^X(distr) * B^X(local) 4
3831  // u7) C(distr) = A^X(distr) * B^X(local) 4
3832  // u8) C(local) = A^X(local) * B^X(distr) 4
3833  //
3834  // Total number of cases: 32 (= 2^5).
3835
3836  impl_scalar_type beta_local = beta;  // local copy of beta; might be reassigned below
3837
3838  const bool A_is_local = !A.isDistributed();
3839  const bool B_is_local = !B.isDistributed();
3840  const bool C_is_local = !this->isDistributed();
3841
3842  // In debug mode, check compatibility of local dimensions. We
3843  // only do this in debug mode, since it requires an all-reduce
3844  // to ensure correctness on all processses. It's entirely
3845  // possible that only some processes may have incompatible local
3846  // dimensions. Throwing an exception only on those processes
3847  // could cause this method to hang.
3848  const bool debug = ::Tpetra::Details::Behavior::debug();
3849  if (debug) {
3850  auto myMap = this->getMap();
3851  if (!myMap.is_null() && !myMap->getComm().is_null()) {
3852  using Teuchos::outArg;
3853  using Teuchos::REDUCE_MIN;
3854  using Teuchos::reduceAll;
3855
3856  auto comm = myMap->getComm();
  // Effective (post-transpose) local dimensions of op(A) and op(B).
3857  const size_t A_nrows =
3858  (transA != NO_TRANS) ? A.getNumVectors() : A.getLocalLength();
3859  const size_t A_ncols =
3860  (transA != NO_TRANS) ? A.getLocalLength() : A.getNumVectors();
3861  const size_t B_nrows =
3862  (transB != NO_TRANS) ? B.getNumVectors() : B.getLocalLength();
3863  const size_t B_ncols =
3864  (transB != NO_TRANS) ? B.getLocalLength() : B.getNumVectors();
3865
3866  const bool lclBad = this->getLocalLength() != A_nrows ||
3867  this->getNumVectors() != B_ncols || A_ncols != B_nrows;
3868
3869  const int myRank = comm->getRank();
3870  std::ostringstream errStrm;
3871  if (this->getLocalLength() != A_nrows) {
3872  errStrm << "Proc " << myRank << ": this->getLocalLength()="
3873  << this->getLocalLength() << " != A_nrows=" << A_nrows
3874  << "." << std::endl;
3875  }
3876  if (this->getNumVectors() != B_ncols) {
3877  errStrm << "Proc " << myRank << ": this->getNumVectors()="
3878  << this->getNumVectors() << " != B_ncols=" << B_ncols
3879  << "." << std::endl;
3880  }
3881  if (A_ncols != B_nrows) {
3882  errStrm << "Proc " << myRank << ": A_ncols="
3883  << A_ncols << " != B_nrows=" << B_nrows
3884  << "." << std::endl;
3885  }
3886
  // All-reduce (min) so every process agrees on success/failure and
  // either all throw or none do (avoids a parallel hang).
3887  const int lclGood = lclBad ? 0 : 1;
3888  int gblGood = 0;
3890  if (gblGood != 1) {
3891  std::ostringstream os;
3893
3894  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3895  "Inconsistent local dimensions on at "
3896  "least one process in this object's communicator."
3897  << std::endl
3898  << "Operation: "
3899  << "C(" << (C_is_local ? "local" : "distr") << ") = "
3900  << alpha << "*A"
3901  << (transA == Teuchos::TRANS ? "^T" : (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3902  << "(" << (A_is_local ? "local" : "distr") << ") * "
3903  << beta << "*B"
3904  << (transA == Teuchos::TRANS ? "^T" : (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3905  << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
3906  << "Global dimensions: C(" << this->getGlobalLength() << ", "
3907  << this->getNumVectors() << "), A(" << A.getGlobalLength()
3908  << ", " << A.getNumVectors() << "), B(" << B.getGlobalLength()
3909  << ", " << B.getNumVectors() << ")." << std::endl
3910  << os.str());
3911  }
3912  }
3913  }
3914
3915  // Case 1: C(local) = A^X(local) * B^X(local)
3916  const bool Case1 = C_is_local && A_is_local && B_is_local;
3917  // Case 2: C(local) = A^T(distr) * B (distr)
3918  const bool Case2 = C_is_local && !A_is_local && !B_is_local &&
3919  transA != NO_TRANS &&
3920  transB == NO_TRANS;
3921  // Case 3: C(distr) = A (distr) * B^X(local)
3922  const bool Case3 = !C_is_local && !A_is_local && B_is_local &&
3923  transA == NO_TRANS;
3924
3925  // Test that we are considering a meaningful case
3926  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!Case1 && !Case2 && !Case3, std::runtime_error,
3927  "Multiplication of op(A) and op(B) into *this is not a "
3928  "supported use case.");
3929
3930  if (beta != STS::zero() && Case2) {
3931  // If Case2, then C is local and contributions must be summed
3932  // across all processes. However, if beta != 0, then accumulate
3933  // beta*C into the sum. When summing across all processes, we
3934  // only want to accumulate this once, so set beta == 0 on all
3935  // processes except Process 0.
3936  const int myRank = this->getMap()->getComm()->getRank();
3937  if (myRank != 0) {
3938  beta_local = ATS::zero();
3939  }
3940  }
3941
3942  // We only know how to do matrix-matrix multiplies if all the
3943  // MultiVectors have constant stride. If not, we have to make
3944  // temporary copies of those MultiVectors (including possibly
3945  // *this) that don't have constant stride.
3946  RCP<MV> C_tmp;
3947  if (!isConstantStride()) {
3948  C_tmp = rcp(new MV(*this, Teuchos::Copy));  // deep copy
3949  } else {
  // Non-owning RCP: C_tmp aliases *this; do not let it delete this.
3950  C_tmp = rcp(this, false);
3951  }
3952
3954  if (!A.isConstantStride()) {
3955  A_tmp = rcp(new MV(A, Teuchos::Copy));  // deep copy
3956  } else {
3957  A_tmp = rcpFromRef(A);
3958  }
3959
3961  if (!B.isConstantStride()) {
3962  B_tmp = rcp(new MV(B, Teuchos::Copy));  // deep copy
3963  } else {
3964  B_tmp = rcpFromRef(B);
3965  }
3966
3967  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!C_tmp->isConstantStride() || !B_tmp->isConstantStride() ||
3968  !A_tmp->isConstantStride(),
3969  std::logic_error,
3970  "Failed to make temporary constant-stride copies of MultiVectors.");
3971
3972  {
  // Trim each operand's device view to its active local extent and
  // run the local GEMM on device.
3973  const LO A_lclNumRows = A_tmp->getLocalLength();
3974  const LO A_numVecs = A_tmp->getNumVectors();
3975  auto A_lcl = A_tmp->getLocalViewDevice(Access::ReadOnly);
3976  auto A_sub = Kokkos::subview(A_lcl,
3977  std::make_pair(LO(0), A_lclNumRows),
3978  std::make_pair(LO(0), A_numVecs));
3979
3980  const LO B_lclNumRows = B_tmp->getLocalLength();
3981  const LO B_numVecs = B_tmp->getNumVectors();
3982  auto B_lcl = B_tmp->getLocalViewDevice(Access::ReadOnly);
3983  auto B_sub = Kokkos::subview(B_lcl,
3984  std::make_pair(LO(0), B_lclNumRows),
3985  std::make_pair(LO(0), B_numVecs));
3986
3987  const LO C_lclNumRows = C_tmp->getLocalLength();
3988  const LO C_numVecs = C_tmp->getNumVectors();
3989
3990  auto C_lcl = C_tmp->getLocalViewDevice(Access::ReadWrite);
3991  auto C_sub = Kokkos::subview(C_lcl,
3992  std::make_pair(LO(0), C_lclNumRows),
3993  std::make_pair(LO(0), C_numVecs));
  // Map Teuchos transpose enums to BLAS-style character flags.
3994  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' : (transA == Teuchos::TRANS ? 'T' : 'C'));
3995  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' : (transB == Teuchos::TRANS ? 'T' : 'C'));
3997
3998  ProfilingRegion regionGemm("Tpetra::MV::multiply-call-gemm");
3999
4000  KokkosBlas::gemm(&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4001  beta_local, C_sub);
4002  }
4003
4004  if (!isConstantStride()) {
4005  ::Tpetra::deep_copy(*this, *C_tmp);  // Copy the result back into *this.
4006  }
4007
4008  // Dispose of (possibly) extra copies of A and B.
4009  A_tmp = Teuchos::null;
4010  B_tmp = Teuchos::null;
4011
4012  // If Case 2 then sum up *this and distribute it to all processes.
4013  if (Case2) {
4014  this->reduce();
4015  }
4016 }
4017
// Element-wise multiply: this(i,j) = scalarThis*this(i,j)
//                                    + scalarAB * A(i) * B(i,j),
// where A is a single-column Vector broadcast across the columns of B.
// Uses one fused KokkosBlas::mult when both *this and B have constant
// stride; otherwise loops over columns, mapping through whichVectors_.
// NOTE(review): this listing omits original lines 4019-4023 (the
// class qualifier and the parameter list: scalarAB, A, B, scalarThis),
// plus 4033 and 4060 (macro start and the C-column subview argument).
4018 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4024  using Kokkos::ALL;
4025  using Kokkos::subview;
4026  const char tfecfFuncName[] = "elementWiseMultiply: ";
4027
4028  const size_t lclNumRows = this->getLocalLength();
4029  const size_t numVecs = this->getNumVectors();
4030
4031  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclNumRows != A.getLocalLength() || lclNumRows != B.getLocalLength(),
4032  std::runtime_error, "MultiVectors do not have the same local length.");
4034  numVecs != B.getNumVectors(), std::runtime_error,
4035  "this->getNumVectors"
4036  "() = "
4037  << numVecs << " != B.getNumVectors() = " << B.getNumVectors()
4038  << ".");
4039
4040  auto this_view = this->getLocalViewDevice(Access::ReadWrite);
4041  auto A_view = A.getLocalViewDevice(Access::ReadOnly);
4042  auto B_view = B.getLocalViewDevice(Access::ReadOnly);
4043
4044  if (isConstantStride() && B.isConstantStride()) {
4045  // A is just a Vector; it only has one column, so it always has
4046  // constant stride.
4047  //
4048  // If both *this and B have constant stride, we can do an
4049  // element-wise multiply on all columns at once.
4050  KokkosBlas::mult(scalarThis,
4051  this_view,
4052  scalarAB,
4053  subview(A_view, ALL(), 0),
4054  B_view);
4055  } else {
4056  for (size_t j = 0; j < numVecs; ++j) {
4057  const size_t C_col = isConstantStride() ? j : whichVectors_[j];
4058  const size_t B_col = B.isConstantStride() ? j : B.whichVectors_[j];
4059  KokkosBlas::mult(scalarThis,
4061  scalarAB,
4062  subview(A_view, ALL(), 0),
4063  subview(B_view, ALL(), B_col));
4064  }
4065  }
4066 }
4067
// Sum the local data element-wise across all processes in the Map's
// communicator, replacing the local data with the global sum (in place).
// No-op if the Map or its communicator is null. Chooses host or device
// buffers for the all-reduce based on where the data were last modified.
// NOTE(review): this listing omits original line 4069 (class qualifier).
4068 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4070  reduce() {
4071  using ::Tpetra::Details::allReduceView;
4072  using ::Tpetra::Details::ProfilingRegion;
4073  ProfilingRegion region("Tpetra::MV::reduce");
4074
4075  const auto map = this->getMap();
4076  if (map.get() == nullptr) {
4077  return;
4078  }
4079  const auto comm = map->getComm();
4080  if (comm.get() == nullptr) {
4081  return;
4082  }
4083
4084  // Avoid giving device buffers to MPI. Even if MPI handles them
4085  // correctly, doing so may not perform well.
4086  const bool changed_on_device = this->need_sync_host();
4087  if (changed_on_device) {
4088  // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4089  // and host will be separate allocations. In that case, it may
4090  // pay to do the all-reduce from device to host.
4091  Kokkos::fence("MultiVector::reduce");  // for UVM getLocalViewDevice is UVM which can be read as host by allReduceView, so we must not read until device is fenced
4092  auto X_lcl = this->getLocalViewDevice(Access::ReadWrite);
4093  allReduceView(X_lcl, X_lcl, *comm);
4094  } else {
4095  auto X_lcl = this->getLocalViewHost(Access::ReadWrite);
4096  allReduceView(X_lcl, X_lcl, *comm);
4097  }
4098 }
4099
// Overwrite one entry: this(lclRow, col) = ScalarValue, using host access.
// The local-row and column bounds checks run only inside the (truncated)
// debug guard that opened before line 4106.
// NOTE(review): this listing omits original lines 4101-4102 (class
// qualifier), 4104-4105 (lclRow and ScalarValue parameters), 4108-4109
// (row-check macro start), and 4115 (column-check macro start).
4100 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4103  const size_t col,
4106  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4107  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4110  std::runtime_error,
4111  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4112  << " is invalid. The range of valid row indices on this process "
4113  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4114  << ", " << maxLocalIndex << "].");
4116  vectorIndexOutOfRange(col),
4117  std::runtime_error,
4118  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4119  << " of the multivector is invalid.");
4120  }
4121
  // Map the logical column to the physical one for non-constant stride.
4122  auto view = this->getLocalViewHost(Access::ReadWrite);
4123  const size_t colInd = isConstantStride() ? col : whichVectors_[col];
4124  view(lclRow, colInd) = ScalarValue;
4125 }
4126
// Add into one entry: this(lclRow, col) += value, on the host view.
// If 'atomic' is true, the update uses Kokkos::atomic_add so concurrent
// updates to the same entry are safe; otherwise it is a plain +=.
// NOTE(review): this listing omits original lines 4128-4129 (class
// qualifier and lclRow parameter), 4133 (debug guard start), 4136-4137,
// and 4143 (macro starts).
4127 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4130  const size_t col,
4131  const impl_scalar_type& value,
4132  const bool atomic) {
4134  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4135  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4138  std::runtime_error,
4139  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4140  << " is invalid. The range of valid row indices on this process "
4141  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4142  << ", " << maxLocalIndex << "].");
4144  vectorIndexOutOfRange(col),
4145  std::runtime_error,
4146  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4147  << " of the multivector is invalid.");
4148  }
4149
4150  const size_t colInd = isConstantStride() ? col : whichVectors_[col];
4151
4152  auto view = this->getLocalViewHost(Access::ReadWrite);
4153  if (atomic) {
4154  Kokkos::atomic_add(&(view(lclRow, colInd)), value);
4155  } else {
4156  view(lclRow, colInd) += value;
4157  }
4158 }
4159
// Global-index variant of replaceLocalValue: translate gblRow to a local
// row index via the Map, (optionally, in debug mode) validate it, then
// delegate to replaceLocalValue.
// NOTE(review): this listing omits original lines 4161-4162 (class
// qualifier and gblRow parameter), 4164 (ScalarValue parameter), and
// 4169 (debug guard start).
4160 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4163  const size_t col,
4165  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4166  // touches the RCP's reference count, which isn't thread safe.
4167  const LocalOrdinal lclRow = this->map_->getLocalElement(gblRow);
4168
4170  const char tfecfFuncName[] = "replaceGlobalValue: ";
4171  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid(),
4172  std::runtime_error,
4173  "Global row index " << gblRow << " is not present on this process "
4174  << this->getMap()->getComm()->getRank() << ".")
4175  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->vectorIndexOutOfRange(col), std::runtime_error,
4176  "Vector index " << col << " of the MultiVector is invalid.");
4177  }
4178
4179  this->replaceLocalValue(lclRow, col, ScalarValue);
4180 }
4181
// Global-index variant of sumIntoLocalValue: translate globalRow to a
// local index via the Map, (optionally, in debug mode) validate it, then
// delegate to sumIntoLocalValue with the same 'atomic' flag.
// NOTE(review): this listing omits original lines 4183-4184 (class
// qualifier and globalRow parameter), 4192-4193, and 4199 (macro starts).
4182 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4185  const size_t col,
4186  const impl_scalar_type& value,
4187  const bool atomic) {
4188  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4189  // touches the RCP's reference count, which isn't thread safe.
4190  const LocalOrdinal lclRow = this->map_->getLocalElement(globalRow);
4191
4194  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid(),
4195  std::runtime_error,
4196  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4197  << " is not present on this process "
4198  << this->getMap()->getComm()->getRank() << ".");
4200  vectorIndexOutOfRange(col),
4201  std::runtime_error,
4202  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4203  << " of the multivector is invalid.");
4204  }
4205
4206  this->sumIntoLocalValue(lclRow, col, value, atomic);
4207 }
4208
// Return a persisting ArrayRCP over the device data of (physical)
// column j of view_. The 'arr' argument is not used by the visible body.
// NOTE(review): this listing omits original lines 4212 (class qualifier),
// 4217-4218 (rest of the DualView typedef), and 4220 (the X_col
// declaration that receives the subview).
4209 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4210 template <class T>
4211 Teuchos::ArrayRCP<T>
4213  getSubArrayRCP(Teuchos::ArrayRCP<T> arr,
4214  size_t j) const {
4215  typedef Kokkos::DualView<impl_scalar_type*,
4216  typename dual_view_type::array_layout,
4219  const size_t col = isConstantStride() ? j : whichVectors_[j];
4221  Kokkos::subview(view_, Kokkos::ALL(), col);
4222  return Kokkos::Compat::persistingView(X_col.view_device());
4223 }
4224
// True if the host copy of the data is stale (device was modified more
// recently); delegates to the wrapped dual view.
// NOTE(review): this listing omits original line 4226 (return type
// and class qualifier).
4225 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4227  need_sync_host() const {
4228  return view_.need_sync_host();
4229 }
4230
// True if the device copy of the data is stale (host was modified more
// recently); delegates to the wrapped dual view.
// NOTE(review): this listing omits original line 4232.
4231 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4233  need_sync_device() const {
4234  return view_.need_sync_device();
4235 }
4236
// Build the one-line description string shared by MultiVector and Vector.
// 'className' selects which fields to print (Vector omits numCols /
// isConstantStride).
// NOTE(review): this listing omits original line 4239 (class qualifier).
4237 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4238 std::string
4240  descriptionImpl(const std::string& className) const {
4241  using Teuchos::TypeNameTraits;
4242
4243  std::ostringstream out;
4244  out << "\"" << className << "\": {";
4245  out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name()
4246  << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name()
4247  << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name()
  // NOTE(review): unlike the other keys, "Node" has no ": " separator
  // before its value — confirm whether that is intended output.
4248  << ", Node" << Node::name()
4249  << "}, ";
4250  if (this->getObjectLabel() != "") {
4251  out << "Label: \"" << this->getObjectLabel() << "\", ";
4252  }
  // NOTE(review): the "}, " emitted above plus the leading ", " here
  // produce a doubled separator when there is no label — confirm intended.
4253  out << ", numRows: " << this->getGlobalLength();
4254  if (className != "Tpetra::Vector") {
4255  out << ", numCols: " << this->getNumVectors()
4256  << ", isConstantStride: " << this->isConstantStride();
4257  }
4258  if (this->isConstantStride()) {
4259  out << ", columnStride: " << this->getStride();
4260  }
4261  out << "}";
4262
4263  return out.str();
4264 }
4265
// Teuchos::Describable override: one-line description of this object,
// delegating to descriptionImpl with the concrete class name.
// NOTE(review): this listing omits original line 4268 (class qualifier).
4266 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4267 std::string
4269  description() const {
4270  return this->descriptionImpl("Tpetra::MultiVector");
4271 }
4272
4273namespace Details {
4274template <typename ViewType>
4275void print_vector(Teuchos::FancyOStream& out, const char* prefix, const ViewType& v) {
4276 using std::endl;
4277 static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename ViewType::memory_space>::accessible,
4278 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a host-accessible view.");
4279 static_assert(ViewType::rank == 2,
4280 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a rank-2 view.");
4281 // The square braces [] and their contents are in Matlab
4282 // format, so users may copy and paste directly into Matlab.
4283 out << "Values(" << prefix << "): " << std::endl
4284 << "[";
4285 const size_t numRows = v.extent(0);
4286 const size_t numCols = v.extent(1);
4287 if (numCols == 1) {
4288 for (size_t i = 0; i < numRows; ++i) {
4289 out << v(i, 0);
4290 if (i + 1 < numRows) {
4291 out << "; ";
4292 }
4293 }
4294 } else {
4295 for (size_t i = 0; i < numRows; ++i) {
4296 for (size_t j = 0; j < numCols; ++j) {
4297 out << v(i, j);
4298 if (j + 1 < numCols) {
4299 out << ", ";
4300 }
4301 }
4302 if (i + 1 < numRows) {
4303 out << "; ";
4304 }
4305 }
4306 }
4307 out << "]" << endl;
4308}
4309} // namespace Details
4310
// Build this process's contribution to describe() output as a string:
// rank header, local row count, and — at higher verbosity — stride info
// and (at VERB_EXTREME) the actual values via Details::print_vector.
// Uses getDeviceCopy/getHostCopy so printing does not trigger the
// dual view's auto-sync machinery (the method is conceptually const).
// NOTE(review): this listing omits original lines 4313 (class qualifier)
// and 4351 (part of a comment).
4311 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4312 std::string
4314  localDescribeToString(const Teuchos::EVerbosityLevel vl) const {
4315  typedef LocalOrdinal LO;
4316  using std::endl;
4317
4318  if (vl <= Teuchos::VERB_LOW) {
4319  return std::string();
4320  }
4321  auto map = this->getMap();
4322  if (map.is_null()) {
4323  return std::string();
4324  }
4325  auto outStringP = Teuchos::rcp(new std::ostringstream());
4326  auto outp = Teuchos::getFancyOStream(outStringP);
4327  Teuchos::FancyOStream& out = *outp;
4328  auto comm = map->getComm();
4329  const int myRank = comm->getRank();
4330  const int numProcs = comm->getSize();
4331
4332  out << "Process " << myRank << " of " << numProcs << ":" << endl;
4333  Teuchos::OSTab tab1(out);
4334
4335  // VERB_MEDIUM and higher prints getLocalLength()
4336  const LO lclNumRows = static_cast<LO>(this->getLocalLength());
4337  out << "Local number of rows: " << lclNumRows << endl;
4338
4339  if (vl > Teuchos::VERB_MEDIUM) {
4340  // VERB_HIGH and higher prints isConstantStride() and getStride().
4341  // The first is only relevant if the Vector has multiple columns.
4342  if (this->getNumVectors() != static_cast<size_t>(1)) {
4343  out << "isConstantStride: " << this->isConstantStride() << endl;
4344  }
4345  if (this->isConstantStride()) {
4346  out << "Column stride: " << this->getStride() << endl;
4347  }
4348
4349  if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4350  // VERB_EXTREME prints values. If we're not doing univied memory, we'll
4352  // so we can't use our regular accessor functins
4353
4354  // NOTE: This is an occasion where we do *not* want the auto-sync stuff
4355  // to trigger (since this function is conceptually const). Thus, we
4356  // get *copies* of the view's data instead.
4357  auto X_dev = view_.getDeviceCopy();
4358  auto X_host = view_.getHostCopy();
4359
4360  if (X_dev.data() == X_host.data()) {
4361  // One single allocation
4362  Details::print_vector(out, "unified", X_host);
4363  } else {
4364  Details::print_vector(out, "host", X_host);
4365  Details::print_vector(out, "dev", X_dev);
4366  }
4367  }
4368  }
4369  out.flush();  // make sure the ostringstream got everything
4370  return outStringP->str();
4371 }
4372
4373template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4375 describeImpl(Teuchos::FancyOStream& out,
4376 const std::string& className,
4377 const Teuchos::EVerbosityLevel verbLevel) const {
4378 using std::endl;
4379 using Teuchos::TypeNameTraits;
4380 using Teuchos::VERB_DEFAULT;
4381 using Teuchos::VERB_LOW;
4382 using Teuchos::VERB_NONE;
4383 const Teuchos::EVerbosityLevel vl =
4385
4386 if (vl == VERB_NONE) {
4387 return; // don't print anything
4388 }
4389 // If this Vector's Comm is null, then the Vector does not
4390 // participate in collective operations with the other processes.
4391 // In that case, it is not even legal to call this method. The
4392 // reasonable thing to do in that case is nothing.
4393 auto map = this->getMap();
4394 if (map.is_null()) {
4395 return;
4396 }
4397 auto comm = map->getComm();
4398 if (comm.is_null()) {
4399 return;
4400 }
4401
4402 const int myRank = comm->getRank();
4403
4404 // Only Process 0 should touch the output stream, but this method
4405 // in general may need to do communication. Thus, we may need to
4406 // preserve the current tab level across multiple "if (myRank ==
4407 // 0) { ... }" inner scopes. This is why we sometimes create
4408 // OSTab instances by pointer, instead of by value. We only need
4409 // to create them by pointer if the tab level must persist through
4410 // multiple inner scopes.
4411 Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4412
4413 // VERB_LOW and higher prints the equivalent of description().
4414 if (myRank == 0) {
4415 tab0 = Teuchos::rcp(new Teuchos::OSTab(out));
4416 out << "\"" << className << "\":" << endl;
4417 tab1 = Teuchos::rcp(new Teuchos::OSTab(out));
4418 {
4419 out << "Template parameters:" << endl;
4420 Teuchos::OSTab tab2(out);
4421 out << "Scalar: " << TypeNameTraits<Scalar>::name() << endl
4422 << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name() << endl
4423 << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name() << endl
4424 << "Node: " << Node::name() << endl;
4425 }
4426 if (this->getObjectLabel() != "") {
4427 out << "Label: \"" << this->getObjectLabel() << "\", ";
4428 }
4429 out << "Global number of rows: " << this->getGlobalLength() << endl;
4430 if (className != "Tpetra::Vector") {
4431 out << "Number of columns: " << this->getNumVectors() << endl;
4432 }
4433 // getStride() may differ on different processes, so it (and
4434 // isConstantStride()) properly belong to per-process data.
4435 }
4436
4437 // This is collective over the Map's communicator.
4438 if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4439 const std::string lclStr = this->localDescribeToString(vl);
4441 }
4442}
4443
4444template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4446 describe(Teuchos::FancyOStream& out,
4447 const Teuchos::EVerbosityLevel verbLevel) const {
4448 this->describeImpl(out, "Tpetra::MultiVector", verbLevel);
4449}
4450
4451template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4453 removeEmptyProcessesInPlace(const Teuchos::RCP<const map_type>& newMap) {
4454 replaceMap(newMap);
4455}
4456
4457template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4460 using ::Tpetra::Details::localDeepCopy;
4461 const char prefix[] = "Tpetra::MultiVector::assign: ";
4462
4463 TEUCHOS_TEST_FOR_EXCEPTION(this->getGlobalLength() != src.getGlobalLength() ||
4464 this->getNumVectors() != src.getNumVectors(),
4465 std::invalid_argument,
4466 prefix << "Global dimensions of the two Tpetra::MultiVector "
4467 "objects do not match. src has dimensions ["
4468 << src.getGlobalLength()
4469 << "," << src.getNumVectors() << "], and *this has dimensions ["
4470 << this->getGlobalLength() << "," << this->getNumVectors() << "].");
4471 // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4472 TEUCHOS_TEST_FOR_EXCEPTION(this->getLocalLength() != src.getLocalLength(), std::invalid_argument,
4473 prefix << "The local row counts of the two Tpetra::MultiVector "
4474 "objects do not match. src has "
4475 << src.getLocalLength() << " row(s) "
4476 << " and *this has " << this->getLocalLength() << " row(s).");
4477
4478 // See #1510. We're writing to *this, so we don't care about its
4479 // contents in either memory space, and we don't want
4480 // DualView::modify to complain about "concurrent modification" of
4481 // host and device Views.
4482
4483 // KJ: this is problematic. assign funtion is used to construct a subvector
4484 // if the sync flag is reset here, it lose all our control over getLocalView interface
4485 // this->clear_sync_state();
4486
4487 // If need sync to device, then host has most recent version.
4488 const bool src_last_updated_on_host = src.need_sync_device();
4489
4491 localDeepCopy(this->getLocalViewHost(Access::ReadWrite),
4492 src.getLocalViewHost(Access::ReadOnly),
4493 this->isConstantStride(),
4494 src.isConstantStride(),
4495 this->whichVectors_(),
4496 src.whichVectors_());
4497 } else {
4498 localDeepCopy(this->getLocalViewDevice(Access::ReadWrite),
4499 src.getLocalViewDevice(Access::ReadOnly),
4500 this->isConstantStride(),
4501 src.isConstantStride(),
4502 this->whichVectors_(),
4503 src.whichVectors_());
4504 }
4505}
4506
4507template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4508template <class T>
4509Teuchos::RCP<MultiVector<T, LocalOrdinal, GlobalOrdinal, Node> >
4511 convert() const {
4512 using Teuchos::RCP;
4513
4514 auto newMV = Teuchos::rcp(new MultiVector<T, LocalOrdinal, GlobalOrdinal, Node>(this->getMap(),
4515 this->getNumVectors(),
4516 /*zeroOut=*/false));
4517 Tpetra::deep_copy(*newMV, *this);
4518 return newMV;
4519}
4520
4521template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4524 using ::Tpetra::Details::PackTraits;
4525
4526 const size_t l1 = this->getLocalLength();
4527 const size_t l2 = vec.getLocalLength();
4528 if ((l1 != l2) || (this->getNumVectors() != vec.getNumVectors())) {
4529 return false;
4530 }
4531
4532 return true;
4533}
4534
4535template <class ST, class LO, class GO, class NT>
4538 std::swap(mv.map_, this->map_);
4539 std::swap(mv.view_, this->view_);
4540 std::swap(mv.whichVectors_, this->whichVectors_);
4541}
4542
4543#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4544template <class ST, class LO, class GO, class NT>
4546 const Teuchos::SerialDenseMatrix<int, ST>& src) {
4547 using ::Tpetra::Details::localDeepCopy;
4548 using MV = MultiVector<ST, LO, GO, NT>;
4549 using IST = typename MV::impl_scalar_type;
4550 using input_view_type =
4551 Kokkos::View<const IST**, Kokkos::LayoutLeft,
4552 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4553 using pair_type = std::pair<LO, LO>;
4554
4555 const LO numRows = static_cast<LO>(src.numRows());
4556 const LO numCols = static_cast<LO>(src.numCols());
4557
4558 TEUCHOS_TEST_FOR_EXCEPTION(numRows != static_cast<LO>(dst.getLocalLength()) ||
4559 numCols != static_cast<LO>(dst.getNumVectors()),
4560 std::invalid_argument, "Tpetra::deep_copy: On Process " << dst.getMap()->getComm()->getRank() << ", dst is " << dst.getLocalLength() << " x " << dst.getNumVectors() << ", but src is " << numRows << " x " << numCols << ".");
4561
4562 const IST* src_raw = reinterpret_cast<const IST*>(src.values());
4563 input_view_type src_orig(src_raw, src.stride(), numCols);
4564 input_view_type src_view =
4565 Kokkos::subview(src_orig, pair_type(0, numRows), Kokkos::ALL());
4566
4567 constexpr bool src_isConstantStride = true;
4568 Teuchos::ArrayView<const size_t> srcWhichVectors(nullptr, 0);
4569 localDeepCopy(dst.getLocalViewHost(Access::ReadWrite),
4570 src_view,
4571 dst.isConstantStride(),
4573 getMultiVectorWhichVectors(dst),
4575}
4576
4577template <class ST, class LO, class GO, class NT>
4578void deep_copy(Teuchos::SerialDenseMatrix<int, ST>& dst,
4579 const MultiVector<ST, LO, GO, NT>& src) {
4580 using ::Tpetra::Details::localDeepCopy;
4581 using MV = MultiVector<ST, LO, GO, NT>;
4582 using IST = typename MV::impl_scalar_type;
4583 using output_view_type =
4584 Kokkos::View<IST**, Kokkos::LayoutLeft,
4585 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4586 using pair_type = std::pair<LO, LO>;
4587
4588 const LO numRows = static_cast<LO>(dst.numRows());
4589 const LO numCols = static_cast<LO>(dst.numCols());
4590
4591 TEUCHOS_TEST_FOR_EXCEPTION(numRows != static_cast<LO>(src.getLocalLength()) ||
4592 numCols != static_cast<LO>(src.getNumVectors()),
4593 std::invalid_argument, "Tpetra::deep_copy: On Process " << src.getMap()->getComm()->getRank() << ", src is " << src.getLocalLength() << " x " << src.getNumVectors() << ", but dst is " << numRows << " x " << numCols << ".");
4594
4595 IST* dst_raw = reinterpret_cast<IST*>(dst.values());
4596 output_view_type dst_orig(dst_raw, dst.stride(), numCols);
4597 auto dst_view =
4598 Kokkos::subview(dst_orig, pair_type(0, numRows), Kokkos::ALL());
4599
4600 constexpr bool dst_isConstantStride = true;
4601 Teuchos::ArrayView<const size_t> dstWhichVectors(nullptr, 0);
4602
4603 // Prefer the host version of src's data.
4604 localDeepCopy(dst_view,
4605 src.getLocalViewHost(Access::ReadOnly),
4607 src.isConstantStride(),
4609 getMultiVectorWhichVectors(src));
4610}
4611#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4612
4613template <class Scalar, class LO, class GO, class NT>
4614Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4615createMultiVector(const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4616 size_t numVectors) {
4618 return Teuchos::rcp(new MV(map, numVectors));
4619}
4620
4621template <class ST, class LO, class GO, class NT>
4624 typedef MultiVector<ST, LO, GO, NT> MV;
4625 MV cpy(src.getMap(), src.getNumVectors(), false);
4626 cpy.assign(src);
4627 return cpy;
4628}
4629
4630} // namespace Tpetra
4631
//
// Explicit instantiation macro
//
// Must be expanded from within the Tpetra namespace!
//

#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
// With Teuchos numerics enabled, also instantiate the
// Teuchos::SerialDenseMatrix <-> MultiVector deep_copy overloads.
#define TPETRA_MULTIVECTOR_INSTANT(SCALAR, LO, GO, NODE) \
  template class MultiVector<SCALAR, LO, GO, NODE>; \
  template MultiVector<SCALAR, LO, GO, NODE> createCopy(const MultiVector<SCALAR, LO, GO, NODE>& src); \
  template Teuchos::RCP<MultiVector<SCALAR, LO, GO, NODE> > createMultiVector(const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
  template void deep_copy(MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
  template void deep_copy(Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);

#else
// Without Teuchos numerics, instantiate only the class and createCopy.
#define TPETRA_MULTIVECTOR_INSTANT(SCALAR, LO, GO, NODE) \
  template class MultiVector<SCALAR, LO, GO, NODE>; \
  template MultiVector<SCALAR, LO, GO, NODE> createCopy(const MultiVector<SCALAR, LO, GO, NODE>& src);

#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS

// Instantiate the convert<SO>() member template, which converts a
// MultiVector of scalar type SI to one of scalar type SO.
#define TPETRA_MULTIVECTOR_CONVERT_INSTANT(SO, SI, LO, GO, NODE) \
  \
  template Teuchos::RCP<MultiVector<SO, LO, GO, NODE> > \
  MultiVector<SI, LO, GO, NODE>::convert<SO>() const;
4658#endif // TPETRA_MULTIVECTOR_DEF_HPP
Functions for initializing and finalizing Tpetra.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
All-reduce a 1-D or 2-D Kokkos::View.
Declaration of functions for checking whether a given pointer is accessible from a given Kokkos execu...
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
Declaration of a function that prints strings from each process.
Declaration of Tpetra::Details::isInterComm.
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
Declaration and definition of Tpetra::Details::normImpl, which is an implementation detail of Tpetra:...
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
static bool assumeMpiIsGPUAware()
Whether to assume that MPI is CUDA aware.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
static size_t multivectorKernelLocationThreshold()
the threshold for transitioning from device to host
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool remoteLIDsContiguous=false, const CombineMode CM=INSERT)
Reallocate imports_ if needed.
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Number of packets to receive for each receive operation.
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Buffer into which packed data are imported (received from other processes).
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
Number of packets to send for each send operation.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
A parallel distribution of indices over processes.
One or more distributed dense vectors.
void reciprocal(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,...
void normInf(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the infinity-norm of each vector (column), storing the result in a host View.
void get1dCopy(const Teuchos::ArrayView< Scalar > &A, const size_t LDA) const
Fill the given array with a copy of this multivector's local values.
size_t getStride() const
Stride between columns in the multivector.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool areRemoteLIDsContiguous=false, const CombineMode CM=INSERT) override
Reallocate imports_ if needed.
void reduce()
Sum values of a locally replicated multivector across all processes.
virtual bool checkSizes(const SrcDistObject &sourceObj) override
Whether data redistribution between sourceObj and this object is legal.
void randomize()
Set all values in the multivector to pseudorandom numbers.
typename map_type::local_ordinal_type local_ordinal_type
The type of local indices that this class uses.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subView(const Teuchos::Range1D &colRng) const
Return a const MultiVector with const views of selected columns.
void dot(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Teuchos::ArrayView< dot_type > &dots) const
Compute the dot product of each corresponding pair of vectors (columns) in A and B.
void describeImpl(Teuchos::FancyOStream &out, const std::string &className, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Implementation of describe() for this class, and its subclass Vector.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Map and their communicator.
Teuchos::ArrayRCP< Scalar > get1dViewNonConst()
Nonconst persisting (1-D) view of this multivector's local values.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
Teuchos::RCP< Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVectorNonConst(const size_t j)
Return a Vector which is a nonconst view of column j.
void meanValue(const Teuchos::ArrayView< impl_scalar_type > &means) const
Compute mean (average) value of each column.
std::string descriptionImpl(const std::string &className) const
Implementation of description() for this class, and its subclass Vector.
void multiply(Teuchos::ETransp transA, Teuchos::ETransp transB, const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Scalar &beta)
Matrix-matrix multiplication: this = beta*this + alpha*op(A)*op(B).
bool need_sync_device() const
Whether this MultiVector needs synchronization to the device.
wrapped_dual_view_type view_
The Kokkos::DualView containing the MultiVector's data.
void replaceLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using local (row) index.
void swap(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &mv)
Swap contents of mv with contents of *this.
size_t getLocalLength() const
Local number of rows on the calling process.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subCopy(const Teuchos::Range1D &colRng) const
Return a MultiVector with copies of selected columns.
typename Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner ("dot") product result.
Teuchos::ArrayRCP< const Scalar > getData(size_t j) const
Const view of the local values in a particular vector of this multivector.
bool need_sync_host() const
Whether this MultiVector needs synchronization to the host.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM, const execution_space &space) override
Same as copyAndPermute, but do operations in space.
void norm2(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the two-norm of each vector (column), storing the result in a host View.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
Teuchos::ArrayRCP< const Scalar > get1dView() const
Const persisting (1-D) view of this multivector's local values.
Teuchos::ArrayRCP< T > getSubArrayRCP(Teuchos::ArrayRCP< T > arr, size_t j) const
Persisting view of j-th column in the given ArrayRCP.
void replaceGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using global row index.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, device_type > dual_view_type
Kokkos::DualView specialization used by this class.
typename KokkosKernels::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
void elementWiseMultiply(Scalar scalarAB, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarThis)
Multiply a Vector A elementwise by a MultiVector B.
size_t getNumVectors() const
Number of columns in the multivector.
virtual size_t constantNumberOfPackets() const override
Number of packets to send per LID.
wrapped_dual_view_type getWrappedDualView() const
Return the wrapped dual view holding this MultiVector's local data.
void sumIntoLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &val, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using local row index.
void replaceMap(const Teuchos::RCP< const map_type > &map)
Replace the underlying Map in place.
Teuchos::RCP< MultiVector< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another MultiVector with the same entries, but converted to a different Scalar type T.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
typename device_type::execution_space execution_space
Type of the (new) Kokkos execution space.
void abs(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise absolute values of input Multi-vector in target: A = abs(this)
void norm1(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the one-norm of each vector (column), storing the result in a host view.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print the object with the given verbosity level to a FancyOStream.
void get2dCopy(const Teuchos::ArrayView< const Teuchos::ArrayView< Scalar > > &ArrayOfPtrs) const
Fill the given array with a copy of this multivector's local values.
void sumIntoGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using global row index.
bool aliases(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &other) const
Whether this multivector's memory might alias other. This is conservative: if either this or other is...
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that ther...
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subViewNonConst(const Teuchos::Range1D &colRng)
Return a MultiVector with views of selected columns.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
Teuchos::ArrayRCP< Scalar > getDataNonConst(size_t j)
View of the local values in a particular vector of this multivector.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
size_t getOrigNumLocalCols() const
"Original" number of columns in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< Scalar > > get2dViewNonConst()
Return non-const persisting pointers to values.
virtual std::string description() const override
A simple one-line description of this object.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetViewNonConst(const Teuchos::RCP< const map_type > &subMap, const size_t offset)
Return a nonconst view of a subset of rows.
std::string localDescribeToString(const Teuchos::EVerbosityLevel vl) const
Print the calling process' verbose describe() information to the returned string.
bool isSameSize(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &vec) const
size_t getOrigNumLocalRows() const
"Original" number of rows in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< const Scalar > > get2dView() const
Return const persisting pointers to values.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
typename KokkosKernels::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
Abstract base class for objects that can be the source of an Import or Export operation.
Implementation details of Tpetra.
int local_ordinal_type
Default value of Scalar template parameter.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
bool isInterComm(const Teuchos::Comm< int > &)
Return true if and only if the input communicator wraps an MPI intercommunicator.
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector "constructor": Create a MultiVector from a given Map.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes)
@ INSERT
Insert new values that don't currently exist.