Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_MULTIVECTOR_DEF_HPP
11#define TPETRA_MULTIVECTOR_DEF_HPP
12
20
21#include "Tpetra_Core.hpp"
22#include "Tpetra_Util.hpp"
23#include "Tpetra_Vector.hpp"
35#include "Tpetra_Details_Random.hpp"
36#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
37#include "Teuchos_SerialDenseMatrix.hpp"
38#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
39#include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
40#include "KokkosCompat_View.hpp"
41#include "KokkosBlas.hpp"
42#include "KokkosKernels_Utils.hpp"
43#include "Kokkos_Random.hpp"
44#if KOKKOS_VERSION >= 40799
45#include "KokkosKernels_ArithTraits.hpp"
46#else
47#include "Kokkos_ArithTraits.hpp"
48#endif
49#include <memory>
50#include <sstream>
51
52#ifdef HAVE_TPETRA_INST_FLOAT128
53namespace Kokkos {
54// FIXME (mfh 04 Sep 2015) Just a stub for now!
55template <class Generator>
56struct rand<Generator, __float128> {
57 static KOKKOS_INLINE_FUNCTION __float128 max() {
58 return static_cast<__float128>(1.0);
59 }
60 static KOKKOS_INLINE_FUNCTION __float128
61 draw(Generator& gen) {
62 // Half the smallest normalized double, is the scaling factor of
63 // the lower-order term in the double-double representation.
64 const __float128 scalingFactor =
65 static_cast<__float128>(std::numeric_limits<double>::min()) /
66 static_cast<__float128>(2.0);
67 const __float128 higherOrderTerm = static_cast<__float128>(gen.drand());
68 const __float128 lowerOrderTerm =
69 static_cast<__float128>(gen.drand()) * scalingFactor;
70 return higherOrderTerm + lowerOrderTerm;
71 }
72 static KOKKOS_INLINE_FUNCTION __float128
73 draw(Generator& gen, const __float128& range) {
74 // FIXME (mfh 05 Sep 2015) Not sure if this is right.
75 const __float128 scalingFactor =
76 static_cast<__float128>(std::numeric_limits<double>::min()) /
77 static_cast<__float128>(2.0);
78 const __float128 higherOrderTerm =
79 static_cast<__float128>(gen.drand(range));
80 const __float128 lowerOrderTerm =
81 static_cast<__float128>(gen.drand(range)) * scalingFactor;
82 return higherOrderTerm + lowerOrderTerm;
83 }
84 static KOKKOS_INLINE_FUNCTION __float128
85 draw(Generator& gen, const __float128& start, const __float128& end) {
86 // FIXME (mfh 05 Sep 2015) Not sure if this is right.
87 const __float128 scalingFactor =
88 static_cast<__float128>(std::numeric_limits<double>::min()) /
89 static_cast<__float128>(2.0);
90 const __float128 higherOrderTerm =
91 static_cast<__float128>(gen.drand(start, end));
92 const __float128 lowerOrderTerm =
93 static_cast<__float128>(gen.drand(start, end)) * scalingFactor;
94 return higherOrderTerm + lowerOrderTerm;
95 }
96};
97} // namespace Kokkos
98#endif // HAVE_TPETRA_INST_FLOAT128
99
100namespace { // (anonymous)
101
116template <class ST, class LO, class GO, class NT>
118allocDualView(const size_t lclNumRows,
119 const size_t numCols,
120 const bool zeroOut = true,
121 const bool allowPadding = false) {
122 using Kokkos::AllowPadding;
123 using Kokkos::view_alloc;
124 using Kokkos::WithoutInitializing;
125 using ::Tpetra::Details::Behavior;
126 typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
127 typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type wrapped_dual_view_type;
128 typedef typename dual_view_type::t_dev dev_view_type;
129
130 // This needs to be a string and not a char*, if given as an
131 // argument to Kokkos::view_alloc. This is because view_alloc
132 // also allows a raw pointer as its first argument. See
133 // https://github.com/kokkos/kokkos/issues/434.
134 const std::string label("MV::DualView");
135 const bool debug = Behavior::debug();
136
137 // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
138 // creation of the DualView's Views works around
139 // Kokkos::DualView's current inability to accept an
140 // AllocationProperties initial argument (as Kokkos::View does).
141 // However, the work-around is harmless, since it does what the
142 // (currently nonexistent) equivalent DualView constructor would
143 // have done anyway.
144
145 dev_view_type d_view;
146 if (zeroOut) {
147 if (allowPadding) {
148 d_view = dev_view_type(view_alloc(label, AllowPadding),
149 lclNumRows, numCols);
150 } else {
151 d_view = dev_view_type(view_alloc(label),
152 lclNumRows, numCols);
153 }
154 } else {
155 if (allowPadding) {
156 d_view = dev_view_type(view_alloc(label,
157 WithoutInitializing,
158 AllowPadding),
159 lclNumRows, numCols);
160 } else {
161 d_view = dev_view_type(view_alloc(label, WithoutInitializing),
162 lclNumRows, numCols);
163 }
164 if (debug) {
165 // Filling with NaN is a cheap and effective way to tell if
166 // downstream code is trying to use a MultiVector's data
167 // without them having been initialized. ArithTraits lets us
168 // call nan() even if the scalar type doesn't define it; it
169 // just returns some undefined value in the latter case. This
170 // won't hurt anything because by setting zeroOut=false, users
171 // already agreed that they don't care about the contents of
172 // the MultiVector.
173#if KOKKOS_VERSION >= 40799
174 const ST nan = KokkosKernels::ArithTraits<ST>::nan();
175#else
176 const ST nan = Kokkos::ArithTraits<ST>::nan();
177#endif
178 KokkosBlas::fill(d_view, nan);
179 }
180 }
181 if (debug) {
182 TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(d_view.extent(0)) != lclNumRows ||
183 static_cast<size_t>(d_view.extent(1)) != numCols,
184 std::logic_error,
185 "allocDualView: d_view's dimensions actual dimensions do not match "
186 "requested dimensions. d_view is "
187 << d_view.extent(0) << " x " << d_view.extent(1) << "; requested " << lclNumRows << " x " << numCols
188 << ". Please report this bug to the Tpetra developers.");
189 }
190
191 return wrapped_dual_view_type(d_view);
192}
193
194// Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
195//
196// T: The type of the entries of the View.
197// ExecSpace: The Kokkos execution space.
198template <class T, class ExecSpace>
199struct MakeUnmanagedView {
200 // The 'false' part of the branch carefully ensures that this
201 // won't attempt to use a host execution space that hasn't been
202 // initialized. For example, if Kokkos::OpenMP is disabled and
203 // Kokkos::Threads is enabled, the latter is always the default
204 // execution space of Kokkos::HostSpace, even when ExecSpace is
205 // Kokkos::Serial. That's why we go through the trouble of asking
206 // Kokkos::DualView what _its_ space is. That seems to work
207 // around this default execution space issue.
208 //
209 typedef typename std::conditional<
210 Kokkos::SpaceAccessibility<
211 typename ExecSpace::memory_space,
212 Kokkos::HostSpace>::accessible,
213 typename ExecSpace::device_type,
214 typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
215 typedef Kokkos::LayoutLeft array_layout;
216 typedef Kokkos::View<T*, array_layout, host_exec_space,
217 Kokkos::MemoryUnmanaged>
218 view_type;
219
220 static view_type getView(const Teuchos::ArrayView<T>& x_in) {
221 const size_t numEnt = static_cast<size_t>(x_in.size());
222 if (numEnt == 0) {
223 return view_type();
224 } else {
225 return view_type(x_in.getRawPtr(), numEnt);
226 }
227 }
228};
229
230template <class WrappedDualViewType>
231WrappedDualViewType
232takeSubview(const WrappedDualViewType& X,
233 const std::pair<size_t, size_t>& rowRng,
234 const Kokkos::ALL_t& colRng)
235
236{
237 // The bug we saw below should be harder to trigger here.
238 return WrappedDualViewType(X, rowRng, colRng);
239}
240
241template <class WrappedDualViewType>
242WrappedDualViewType
243takeSubview(const WrappedDualViewType& X,
244 const Kokkos::ALL_t& rowRng,
245 const std::pair<size_t, size_t>& colRng) {
246 using DualViewType = typename WrappedDualViewType::DVT;
247 // Look carefullly at the comment in the below version of this function.
248 // The comment applies here as well.
249 if (X.extent(0) == 0 && X.extent(1) != 0) {
250 return WrappedDualViewType(DualViewType("MV::DualView", 0, colRng.second - colRng.first));
251 } else {
252 return WrappedDualViewType(X, rowRng, colRng);
253 }
254}
255
256template <class WrappedDualViewType>
257WrappedDualViewType
258takeSubview(const WrappedDualViewType& X,
259 const std::pair<size_t, size_t>& rowRng,
260 const std::pair<size_t, size_t>& colRng) {
261 using DualViewType = typename WrappedDualViewType::DVT;
262 // If you take a subview of a view with zero rows Kokkos::subview()
263 // always returns a DualView with the same data pointers. This will break
264 // pointer equality testing in between two subviews of the same 2D View if
265 // it has zero row extent. While the one (known) case where this was actually used
266 // has been fixed, that sort of check could very easily be reintroduced in the future,
267 // hence I've added this if check here.
268 //
269 // This is not a bug in Kokkos::subview(), just some very subtle behavior which
270 // future developers should be wary of.
271 if (X.extent(0) == 0 && X.extent(1) != 0) {
272 return WrappedDualViewType(DualViewType("MV::DualView", 0, colRng.second - colRng.first));
273 } else {
274 return WrappedDualViewType(X, rowRng, colRng);
275 }
276}
277
278template <class WrappedOrNotDualViewType>
279size_t
280getDualViewStride(const WrappedOrNotDualViewType& dv) {
281 // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
282 // method yet, but its Views do.
283 // NOTE: dv.stride() returns a vector of length one
284 // more than its rank
285 size_t strides[WrappedOrNotDualViewType::t_dev::rank + 1];
286 dv.stride(strides);
287 const size_t LDA = strides[1];
288 const size_t numRows = dv.extent(0);
289
290 if (LDA == 0) {
291 return (numRows == 0) ? size_t(1) : numRows;
292 } else {
293 return LDA;
294 }
295}
296
297template <class ViewType>
298size_t
299getViewStride(const ViewType& view) {
300 const size_t LDA = view.stride(1);
301 const size_t numRows = view.extent(0);
302
303 if (LDA == 0) {
304 return (numRows == 0) ? size_t(1) : numRows;
305 } else {
306 return LDA;
307 }
308}
309
310template <class impl_scalar_type, class buffer_device_type>
311bool runKernelOnHost(
312 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports) {
313 if (!imports.need_sync_device()) {
314 return false; // most up-to-date on device
315 } else { // most up-to-date on host,
316 // but if large enough, worth running on device anyway
317 size_t localLengthThreshold =
319 return imports.extent(0) <= localLengthThreshold;
320 }
321}
322
323template <class SC, class LO, class GO, class NT>
324bool runKernelOnHost(const ::Tpetra::MultiVector<SC, LO, GO, NT>& X) {
325 if (!X.need_sync_device()) {
326 return false; // most up-to-date on device
327 } else { // most up-to-date on host
328 // but if large enough, worth running on device anyway
329 size_t localLengthThreshold =
331 return X.getLocalLength() <= localLengthThreshold;
332 }
333}
334
335template <class SC, class LO, class GO, class NT>
336void multiVectorNormImpl(typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
338 const ::Tpetra::Details::EWhichNorm whichNorm) {
339 using namespace Tpetra;
340 using ::Tpetra::Details::normImpl;
342 using val_type = typename MV::impl_scalar_type;
343 using mag_type = typename MV::mag_type;
344 using dual_view_type = typename MV::dual_view_type;
345
346 auto map = X.getMap();
347 auto comm = map.is_null() ? nullptr : map->getComm().getRawPtr();
348 auto whichVecs = getMultiVectorWhichVectors(X);
349 const bool isConstantStride = X.isConstantStride();
350 const bool isDistributed = X.isDistributed();
351
352 const bool runOnHost = runKernelOnHost(X);
353 if (runOnHost) {
354 using view_type = typename dual_view_type::t_host;
355 using array_layout = typename view_type::array_layout;
356 using device_type = typename view_type::device_type;
357
358 auto X_lcl = X.getLocalViewHost(Access::ReadOnly);
359 normImpl<val_type, array_layout, device_type,
360 mag_type>(norms, X_lcl, whichNorm, whichVecs,
361 isConstantStride, isDistributed, comm);
362 } else {
363 using view_type = typename dual_view_type::t_dev;
364 using array_layout = typename view_type::array_layout;
365 using device_type = typename view_type::device_type;
366
367 auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
368 normImpl<val_type, array_layout, device_type,
369 mag_type>(norms, X_lcl, whichNorm, whichVecs,
370 isConstantStride, isDistributed, comm);
371 }
372}
373} // namespace
374
375namespace Tpetra {
376
377namespace Details {
378template <typename DstView, typename SrcView>
379struct AddAssignFunctor {
380 // This functor would be better as a lambda, but CUDA cannot compile
381 // lambdas in protected functions. It compiles fine with the functor.
382 AddAssignFunctor(DstView& tgt_, SrcView& src_)
383 : tgt(tgt_)
384 , src(src_) {}
385
387 operator()(const size_t k) const { tgt(k) += src(k); }
388
389 DstView tgt;
390 SrcView src;
391};
392} // namespace Details
393
394template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
396 vectorIndexOutOfRange(const size_t VectorIndex) const {
397 return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
398}
399
400template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
404
405template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
407 MultiVector(const Teuchos::RCP<const map_type>& map,
408 const size_t numVecs,
409 const bool zeroOut)
410 : /* default is true */
411 base_type(map) {
412 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,numVecs,zeroOut)");
413
414 const size_t lclNumRows = this->getLocalLength();
416}
417
418template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
421 const Teuchos::DataAccess copyOrView)
423 , view_(source.view_)
424 , whichVectors_(source.whichVectors_) {
426 const char tfecfFuncName[] =
427 "MultiVector(const MultiVector&, "
428 "const Teuchos::DataAccess): ";
429
430 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV 2-arg \"copy\" ctor");
431
432 if (copyOrView == Teuchos::Copy) {
433 // Reuse the conveniently already existing function that creates
434 // a deep copy.
435 MV cpy = createCopy(source);
436 this->view_ = cpy.view_;
437 this->whichVectors_ = cpy.whichVectors_;
438 } else if (copyOrView == Teuchos::View) {
439 } else {
441 true, std::invalid_argument,
442 "Second argument 'copyOrView' has an "
443 "invalid value "
444 << copyOrView << ". Valid values include "
445 "Teuchos::Copy = "
446 << Teuchos::Copy << " and Teuchos::View = "
447 << Teuchos::View << ".");
448 }
449}
450
451template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
453 MultiVector(const Teuchos::RCP<const map_type>& map,
454 const dual_view_type& view)
455 : base_type(map)
456 , view_(wrapped_dual_view_type(view)) {
457 const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
458 const size_t lclNumRows_map = map.is_null() ? size_t(0) : map->getLocalNumElements();
459 const size_t lclNumRows_view = view.extent(0);
460 const size_t LDA = getDualViewStride(view_);
461
463 std::invalid_argument,
464 "Kokkos::DualView does not match Map. "
465 "map->getLocalNumElements() = "
467 << ", view.extent(0) = " << lclNumRows_view
468 << ", and getStride() = " << LDA << ".");
469
470 using ::Tpetra::Details::Behavior;
471 const bool debug = Behavior::debug();
472 if (debug) {
473 using ::Tpetra::Details::checkGlobalDualViewValidity;
474 std::ostringstream gblErrStrm;
475 const bool verbose = Behavior::verbose();
476 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
477 const bool gblValid =
478 checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
479 comm.getRawPtr());
480 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
481 }
482}
483
484template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
486 MultiVector(const Teuchos::RCP<const map_type>& map,
487 const wrapped_dual_view_type& view)
488 : base_type(map)
489 , view_(view) {
490 const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
491 const size_t lclNumRows_map = map.is_null() ? size_t(0) : map->getLocalNumElements();
492 const size_t lclNumRows_view = view.extent(0);
493 const size_t LDA = getDualViewStride(view);
494
496 std::invalid_argument,
497 "Kokkos::DualView does not match Map. "
498 "map->getLocalNumElements() = "
500 << ", view.extent(0) = " << lclNumRows_view
501 << ", and getStride() = " << LDA << ".");
502
503 using ::Tpetra::Details::Behavior;
504 const bool debug = Behavior::debug();
505 if (debug) {
506 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
507 std::ostringstream gblErrStrm;
508 const bool verbose = Behavior::verbose();
509 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
510 const bool gblValid =
511 checkGlobalWrappedDualViewValidity(&gblErrStrm, view, verbose,
512 comm.getRawPtr());
513 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
514 }
515}
516
517template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
519 MultiVector(const Teuchos::RCP<const map_type>& map,
520 const typename dual_view_type::t_dev& d_view)
521 : base_type(map) {
522 using Teuchos::ArrayRCP;
523 using Teuchos::RCP;
524 const char tfecfFuncName[] = "MultiVector(map,d_view): ";
526 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,d_view)");
527
528 const size_t LDA = getViewStride(d_view);
529 const size_t lclNumRows = map->getLocalNumElements();
531 "Map does not match "
532 "Kokkos::View. map->getLocalNumElements() = "
533 << lclNumRows
534 << ", View's column stride = " << LDA
535 << ", and View's extent(0) = " << d_view.extent(0) << ".");
536
537 auto h_view = Kokkos::create_mirror_view(d_view);
540
541 using ::Tpetra::Details::Behavior;
542 const bool debug = Behavior::debug();
543 if (debug) {
544 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
545 std::ostringstream gblErrStrm;
546 const bool verbose = Behavior::verbose();
547 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
548 const bool gblValid =
549 checkGlobalWrappedDualViewValidity(&gblErrStrm, view_, verbose,
550 comm.getRawPtr());
551 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
552 }
553 // The user gave us a device view. In order to respect its
554 // initial contents, we mark the DualView as "modified on device."
555 // That way, the next sync will synchronize it with the host view.
556 dual_view.modify_device();
557}
558
559template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
561 MultiVector(const Teuchos::RCP<const map_type>& map,
562 const dual_view_type& view,
564 : base_type(map)
565 , view_(wrapped_dual_view_type(view, origView)) {
566 const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
567
568 const size_t LDA = getDualViewStride(origView);
569 const size_t lclNumRows = this->getLocalLength(); // comes from the Map
571 LDA < lclNumRows, std::invalid_argument,
572 "The input Kokkos::DualView's "
573 "column stride LDA = "
574 << LDA << " < getLocalLength() = " << lclNumRows
575 << ". This may also mean that the input origView's first dimension (number "
576 "of rows = "
577 << origView.extent(0) << ") does not not match the number "
578 "of entries in the Map on the calling process.");
579
580 using ::Tpetra::Details::Behavior;
581 const bool debug = Behavior::debug();
582 if (debug) {
583 using ::Tpetra::Details::checkGlobalDualViewValidity;
584 std::ostringstream gblErrStrm;
585 const bool verbose = Behavior::verbose();
586 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
587 const bool gblValid_0 =
588 checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
589 comm.getRawPtr());
590 const bool gblValid_1 =
591 checkGlobalDualViewValidity(&gblErrStrm, origView, verbose,
592 comm.getRawPtr());
593 const bool gblValid = gblValid_0 && gblValid_1;
594 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
595 }
596}
597
598template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
600 MultiVector(const Teuchos::RCP<const map_type>& map,
601 const dual_view_type& view,
602 const Teuchos::ArrayView<const size_t>& whichVectors)
604 , view_(view)
605 , whichVectors_(whichVectors.begin(), whichVectors.end()) {
606 using Kokkos::ALL;
607 using Kokkos::subview;
608 const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
609
610 using ::Tpetra::Details::Behavior;
611 const bool debug = Behavior::debug();
612 if (debug) {
613 using ::Tpetra::Details::checkGlobalDualViewValidity;
614 std::ostringstream gblErrStrm;
615 const bool verbose = Behavior::verbose();
616 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
617 const bool gblValid =
618 checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
619 comm.getRawPtr());
620 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
621 }
622
623 const size_t lclNumRows = map.is_null() ? size_t(0) : map->getLocalNumElements();
624 // Check dimensions of the input DualView. We accept that Kokkos
625 // might not allow construction of a 0 x m (Dual)View with m > 0,
626 // so we only require the number of rows to match if the
627 // (Dual)View has more than zero columns. Likewise, we only
628 // require the number of columns to match if the (Dual)View has
629 // more than zero rows.
631 view.extent(1) != 0 && static_cast<size_t>(view.extent(0)) < lclNumRows,
632 std::invalid_argument, "view.extent(0) = " << view.extent(0) << " < map->getLocalNumElements() = " << lclNumRows << ".");
633 if (whichVectors.size() != 0) {
635 view.extent(1) != 0 && view.extent(1) == 0,
636 std::invalid_argument,
637 "view.extent(1) = 0, but whichVectors.size()"
638 " = "
639 << whichVectors.size() << " > 0.");
640 size_t maxColInd = 0;
641 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
642 for (size_type k = 0; k < whichVectors.size(); ++k) {
644 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid(),
645 std::invalid_argument, "whichVectors[" << k << "] = "
646 "Teuchos::OrdinalTraits<size_t>::invalid().");
647 maxColInd = std::max(maxColInd, whichVectors[k]);
648 }
650 view.extent(1) != 0 && static_cast<size_t>(view.extent(1)) <= maxColInd,
651 std::invalid_argument, "view.extent(1) = " << view.extent(1) << " <= max(whichVectors) = " << maxColInd << ".");
652 }
654 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
655 // zero strides, so modify in that case.
656 const size_t LDA = getDualViewStride(view);
658 "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
659
660 if (whichVectors.size() == 1) {
661 // If whichVectors has only one entry, we don't need to bother
662 // with nonconstant stride. Just shift the view over so it
663 // points to the desired column.
664 //
665 // NOTE (mfh 10 May 2014) This is a special case where we set
666 // origView_ just to view that one column, not all of the
667 // original columns. This ensures that the use of origView_ in
668 // offsetView works correctly.
669 //
670 const std::pair<size_t, size_t> colRng(whichVectors[0],
671 whichVectors[0] + 1);
672 view_ = takeSubview(view_, ALL(), colRng);
673 // whichVectors_.size() == 0 means "constant stride."
674 whichVectors_.clear();
675 }
676}
677
678template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
680 MultiVector(const Teuchos::RCP<const map_type>& map,
681 const wrapped_dual_view_type& view,
682 const Teuchos::ArrayView<const size_t>& whichVectors)
683 : base_type(map)
684 , view_(view)
685 , whichVectors_(whichVectors.begin(), whichVectors.end()) {
686 using Kokkos::ALL;
687 using Kokkos::subview;
688 const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
689
690 using ::Tpetra::Details::Behavior;
691 const bool debug = Behavior::debug();
692 if (debug) {
693 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
694 std::ostringstream gblErrStrm;
695 const bool verbose = Behavior::verbose();
696 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
697 const bool gblValid =
698 checkGlobalWrappedDualViewValidity(&gblErrStrm, view, verbose,
699 comm.getRawPtr());
700 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
701 }
702
703 const size_t lclNumRows = map.is_null() ? size_t(0) : map->getLocalNumElements();
704 // Check dimensions of the input DualView. We accept that Kokkos
705 // might not allow construction of a 0 x m (Dual)View with m > 0,
706 // so we only require the number of rows to match if the
707 // (Dual)View has more than zero columns. Likewise, we only
708 // require the number of columns to match if the (Dual)View has
709 // more than zero rows.
711 view.extent(1) != 0 && static_cast<size_t>(view.extent(0)) < lclNumRows,
712 std::invalid_argument, "view.extent(0) = " << view.extent(0) << " < map->getLocalNumElements() = " << lclNumRows << ".");
713 if (whichVectors.size() != 0) {
715 view.extent(1) != 0 && view.extent(1) == 0,
716 std::invalid_argument,
717 "view.extent(1) = 0, but whichVectors.size()"
718 " = "
719 << whichVectors.size() << " > 0.");
720 size_t maxColInd = 0;
721 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
722 for (size_type k = 0; k < whichVectors.size(); ++k) {
724 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid(),
725 std::invalid_argument, "whichVectors[" << k << "] = "
726 "Teuchos::OrdinalTraits<size_t>::invalid().");
727 maxColInd = std::max(maxColInd, whichVectors[k]);
728 }
730 view.extent(1) != 0 && static_cast<size_t>(view.extent(1)) <= maxColInd,
731 std::invalid_argument, "view.extent(1) = " << view.extent(1) << " <= max(whichVectors) = " << maxColInd << ".");
732 }
733
734 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
735 // zero strides, so modify in that case.
736 const size_t LDA = getDualViewStride(view);
738 "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
739
740 if (whichVectors.size() == 1) {
741 // If whichVectors has only one entry, we don't need to bother
742 // with nonconstant stride. Just shift the view over so it
743 // points to the desired column.
744 //
745 // NOTE (mfh 10 May 2014) This is a special case where we set
746 // origView_ just to view that one column, not all of the
747 // original columns. This ensures that the use of origView_ in
748 // offsetView works correctly.
749 //
750 const std::pair<size_t, size_t> colRng(whichVectors[0],
751 whichVectors[0] + 1);
752 view_ = takeSubview(view_, ALL(), colRng);
753 // whichVectors_.size() == 0 means "constant stride."
754 whichVectors_.clear();
755 }
756}
757
758template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
760 MultiVector(const Teuchos::RCP<const map_type>& map,
761 const dual_view_type& view,
763 const Teuchos::ArrayView<const size_t>& whichVectors)
764 : base_type(map)
765 , view_(wrapped_dual_view_type(view, origView))
766 , whichVectors_(whichVectors.begin(), whichVectors.end()) {
767 using Kokkos::ALL;
768 using Kokkos::subview;
769 const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
770
771 using ::Tpetra::Details::Behavior;
772 const bool debug = Behavior::debug();
773 if (debug) {
774 using ::Tpetra::Details::checkGlobalDualViewValidity;
775 std::ostringstream gblErrStrm;
776 const bool verbose = Behavior::verbose();
777 const auto comm = map.is_null() ? Teuchos::null : map->getComm();
778 const bool gblValid_0 =
779 checkGlobalDualViewValidity(&gblErrStrm, view, verbose,
780 comm.getRawPtr());
781 const bool gblValid_1 =
782 checkGlobalDualViewValidity(&gblErrStrm, origView, verbose,
783 comm.getRawPtr());
784 const bool gblValid = gblValid_0 && gblValid_1;
785 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!gblValid, std::runtime_error, gblErrStrm.str());
786 }
787
788 const size_t lclNumRows = this->getLocalLength();
789 // Check dimensions of the input DualView. We accept that Kokkos
790 // might not allow construction of a 0 x m (Dual)View with m > 0,
791 // so we only require the number of rows to match if the
792 // (Dual)View has more than zero columns. Likewise, we only
793 // require the number of columns to match if the (Dual)View has
794 // more than zero rows.
796 view.extent(1) != 0 && static_cast<size_t>(view.extent(0)) < lclNumRows,
797 std::invalid_argument, "view.extent(0) = " << view.extent(0) << " < map->getLocalNumElements() = " << lclNumRows << ".");
798 if (whichVectors.size() != 0) {
800 view.extent(1) != 0 && view.extent(1) == 0,
801 std::invalid_argument,
802 "view.extent(1) = 0, but whichVectors.size()"
803 " = "
804 << whichVectors.size() << " > 0.");
805 size_t maxColInd = 0;
806 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
807 for (size_type k = 0; k < whichVectors.size(); ++k) {
809 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid(),
810 std::invalid_argument, "whichVectors[" << k << "] = "
811 "Teuchos::OrdinalTraits<size_t>::invalid().");
812 maxColInd = std::max(maxColInd, whichVectors[k]);
813 }
815 view.extent(1) != 0 && static_cast<size_t>(view.extent(1)) <= maxColInd,
816 std::invalid_argument, "view.extent(1) = " << view.extent(1) << " <= max(whichVectors) = " << maxColInd << ".");
817 }
818
819 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
820 // zero strides, so modify in that case.
821 const size_t LDA = getDualViewStride(origView);
823 "Map and DualView origView "
824 "do not match. LDA = "
825 << LDA << " < this->getLocalLength() = " << lclNumRows << ". origView.extent(0) = " << origView.extent(0)
826 << ", origView.stride(1) = " << origView.view_device().stride(1) << ".");
827
828 if (whichVectors.size() == 1) {
829 // If whichVectors has only one entry, we don't need to bother
830 // with nonconstant stride. Just shift the view over so it
831 // points to the desired column.
832 //
833 // NOTE (mfh 10 May 2014) This is a special case where we set
834 // origView_ just to view that one column, not all of the
835 // original columns. This ensures that the use of origView_ in
836 // offsetView works correctly.
837 const std::pair<size_t, size_t> colRng(whichVectors[0],
838 whichVectors[0] + 1);
840 // origView_ = takeSubview (origView_, ALL (), colRng);
841 // whichVectors_.size() == 0 means "constant stride."
842 whichVectors_.clear();
843 }
844}
845
846template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
848 MultiVector(const Teuchos::RCP<const map_type>& map,
849 const Teuchos::ArrayView<const Scalar>& data,
850 const size_t LDA,
851 const size_t numVecs)
852 : base_type(map) {
853 typedef LocalOrdinal LO;
854 typedef GlobalOrdinal GO;
855 const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
856 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
857
858 // Deep copy constructor, constant stride (NO whichVectors_).
859 // There is no need for a deep copy constructor with nonconstant stride.
860
861 const size_t lclNumRows =
862 map.is_null() ? size_t(0) : map->getLocalNumElements();
863 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
864 "map->getLocalNumElements() = "
865 << lclNumRows << ".");
866 if (numVecs != 0) {
867 const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
868 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(data.size()) < minNumEntries,
869 std::invalid_argument,
870 "Input Teuchos::ArrayView does not have enough "
871 "entries, given the input Map and number of vectors in the MultiVector."
872 " data.size() = "
873 << data.size() << " < (LDA*(numVecs-1)) + "
874 "map->getLocalNumElements () = "
875 << minNumEntries << ".");
876 }
877
879 // Note: X_out will be completely overwritten
880 auto X_out = this->getLocalViewDevice(Access::OverwriteAll);
881
882 // Make an unmanaged host Kokkos::View of the input data. First
883 // create a View (X_in_orig) with the original stride. Then,
884 // create a subview (X_in) with the right number of columns.
885 const impl_scalar_type* const X_in_raw =
886 reinterpret_cast<const impl_scalar_type*>(data.getRawPtr());
887 Kokkos::View<const impl_scalar_type**,
888 Kokkos::LayoutLeft,
889 Kokkos::HostSpace,
890 Kokkos::MemoryUnmanaged>
892 const Kokkos::pair<size_t, size_t> rowRng(0, lclNumRows);
893 auto X_in = Kokkos::subview(X_in_orig, rowRng, Kokkos::ALL());
894
895 // If LDA != X_out's column stride, then we need to copy one
896 // column at a time; Kokkos::deep_copy refuses to work in that
897 // case.
898 const size_t outStride =
899 X_out.extent(1) == 0 ? size_t(1) : X_out.stride(1);
900 if (LDA == outStride) { // strides are the same; deep_copy once
901 // This only works because MultiVector uses LayoutLeft.
902 // We would need a custom copy functor otherwise.
903 // DEEP_COPY REVIEW - HOST-TO-DEVICE
904 Kokkos::deep_copy(execution_space(), X_out, X_in);
905 } else { // strides differ; copy one column at a time
906 typedef decltype(Kokkos::subview(X_out, Kokkos::ALL(), 0))
908 typedef decltype(Kokkos::subview(X_in, Kokkos::ALL(), 0))
910 for (size_t j = 0; j < numVecs; ++j) {
911 out_col_view_type X_out_j = Kokkos::subview(X_out, Kokkos::ALL(), j);
912 in_col_view_type X_in_j = Kokkos::subview(X_in, Kokkos::ALL(), j);
913 // DEEP_COPY REVIEW - HOST-TO-DEVICE
914 Kokkos::deep_copy(execution_space(), X_out_j, X_in_j);
915 }
916 }
917}
919template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
921 MultiVector(const Teuchos::RCP<const map_type>& map,
922 const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
923 const size_t numVecs)
924 : base_type(map) {
925 typedef impl_scalar_type IST;
926 typedef LocalOrdinal LO;
927 typedef GlobalOrdinal GO;
928 const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
929 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
930
931 const size_t lclNumRows =
932 map.is_null() ? size_t(0) : map->getLocalNumElements();
933 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numVecs < 1 || numVecs != static_cast<size_t>(ArrayOfPtrs.size()),
934 std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
935 "ArrayOfPtrs.size() (= "
936 << ArrayOfPtrs.size() << ") != numVecs.");
937 for (size_t j = 0; j < numVecs; ++j) {
938 Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
940 static_cast<size_t>(X_j_av.size()) < lclNumRows,
941 std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = " << X_j_av.size() << " < map->getLocalNumElements() = " << lclNumRows << ".");
942 }
943
945 auto X_out = this->getLocalViewDevice(Access::ReadWrite);
946
947 // Make sure that the type of a single input column has the same
948 // array layout as each output column does, so we can deep_copy.
949 using array_layout = typename decltype(X_out)::array_layout;
950 using input_col_view_type = typename Kokkos::View<const IST*,
951 array_layout,
952 Kokkos::HostSpace,
953 Kokkos::MemoryUnmanaged>;
954
955 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
956 for (size_t j = 0; j < numVecs; ++j) {
957 Teuchos::ArrayView<const IST> X_j_av =
958 Teuchos::av_reinterpret_cast<const IST>(ArrayOfPtrs[j]);
960 auto X_j_out = Kokkos::subview(X_out, rowRng, j);
961 // DEEP_COPY REVIEW - HOST-TO-DEVICE
962 Kokkos::deep_copy(execution_space(), X_j_out, X_j_in);
963 }
964}
965
966template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
968 isConstantStride() const {
969 return whichVectors_.empty();
970}
971
972template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
973size_t
975 getLocalLength() const {
976 if (this->getMap().is_null()) { // possible, due to replaceMap().
977 return static_cast<size_t>(0);
978 } else {
979 return this->getMap()->getLocalNumElements();
980 }
981}
982
983template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
986 getGlobalLength() const {
987 if (this->getMap().is_null()) { // possible, due to replaceMap().
988 return static_cast<size_t>(0);
989 } else {
990 return this->getMap()->getGlobalNumElements();
991 }
992}
993
994template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
995size_t
997 getStride() const {
998 return isConstantStride() ? getDualViewStride(view_) : size_t(0);
999}
1000
1001template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1004 // Don't actually get a view, just get pointers.
1005 auto thisData = view_.getDualView().view_host().data();
1006 auto otherData = other.view_.getDualView().view_host().data();
1007 size_t thisSpan = view_.getDualView().view_host().span();
1008 size_t otherSpan = other.view_.getDualView().view_host().span();
1010}
1011
1012template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1015 // Check whether the source object is a MultiVector. If not, then
1016 // we can't even compare sizes, so it's definitely not OK to
1017 // Import or Export from it.
1019 const MV* src = dynamic_cast<const MV*>(&sourceObj);
1020 if (src == nullptr) {
1021 return false;
1022 } else {
1023 // The target of the Import or Export calls checkSizes() in
1024 // DistObject::doTransfer(). By that point, we've already
1025 // constructed an Import or Export object using the two
1026 // multivectors' Maps, which means that (hopefully) we've
1027 // already checked other attributes of the multivectors. Thus,
1028 // all we need to do here is check the number of columns.
1029 return src->getNumVectors() == this->getNumVectors();
1030 }
1031}
1032
1033template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1034size_t
1037 return this->getNumVectors();
1038}
1039
1040template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1043 const size_t numSameIDs,
1044 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1045 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1046 const CombineMode CM,
1047 const execution_space& space) {
1048 using Kokkos::Compat::create_const_view;
1049 using KokkosRefactor::Details::permute_array_multi_column;
1050 using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
1051 using std::endl;
1052 using ::Tpetra::Details::Behavior;
1053 using ::Tpetra::Details::getDualViewCopyFromArrayView;
1054 using ::Tpetra::Details::ProfilingRegion;
1056
1057 // We've already called checkSizes(), so this cast must succeed.
1058 MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&>(sourceObj));
1059
1061 if (importsAreAliased() && (this->constantNumberOfPackets() != 0) && Behavior::enableGranularTransfers()) {
1062 // imports are aliased to the target. We already posted Irecvs
1063 // into imports using a memory space that depends on GPU
1064 // awareness. Therefore we want to modify target in the same
1065 // memory space. See comments in DistObject::beginTransfer.
1066 copyOnHost = !Behavior::assumeMpiIsGPUAware();
1067 } else {
1068 // We are free to choose the better memory space for copyAndPermute.
1070 }
1071 const char longFuncNameHost[] = "Tpetra::MultiVector::copyAndPermute[Host]";
1072 const char longFuncNameDevice[] = "Tpetra::MultiVector::copyAndPermute[Device]";
1073 const char tfecfFuncName[] = "copyAndPermute: ";
1075
1076 const bool verbose = Behavior::verbose();
1077 std::unique_ptr<std::string> prefix;
1078 if (verbose) {
1079 auto map = this->getMap();
1080 auto comm = map.is_null() ? Teuchos::null : map->getComm();
1081 const int myRank = comm.is_null() ? -1 : comm->getRank();
1082 std::ostringstream os;
1083 os << "Proc " << myRank << ": MV::copyAndPermute: ";
1084 prefix = std::unique_ptr<std::string>(new std::string(os.str()));
1085 os << "Start" << endl;
1086 std::cerr << os.str();
1087 }
1088
1090 std::logic_error, "permuteToLIDs.extent(0) = " << permuteToLIDs.extent(0) << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0) << ".");
1091 const size_t numCols = this->getNumVectors();
1092
1093 // sourceMV doesn't belong to us, so we can't sync it. Do the
1094 // copying where it's currently sync'd.
1095 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(sourceMV.need_sync_device() && sourceMV.need_sync_host(),
1096 std::logic_error,
1097 "Input MultiVector needs sync to both host "
1098 "and device.");
1099 if (verbose) {
1100 std::ostringstream os;
1101 os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1102 std::cerr << os.str();
1103 }
1104
1105 if (verbose) {
1106 std::ostringstream os;
1107 os << *prefix << "Copy first " << numSameIDs << " elements" << endl;
1108 std::cerr << os.str();
1109 }
1110
1111 // TODO (mfh 15 Sep 2013) When we replace
1112 // KokkosClassic::MultiVector with a Kokkos::View, there are two
1113 // ways to copy the data:
1114 //
1115 // 1. Get a (sub)view of each column and call deep_copy on that.
1116 // 2. Write a custom kernel to copy the data.
1117 //
1118 // The first is easier, but the second might be more performant in
1119 // case we decide to use layouts other than LayoutLeft. It might
1120 // even make sense to hide whichVectors_ in an entirely new layout
1121 // for Kokkos Views.
1122
1123 // Copy rows [0, numSameIDs-1] of the local multivectors.
1124 //
1125 // Note (ETP 2 Jul 2014) We need to always copy one column at a
1126 // time, even when both multivectors are constant-stride, since
1127 // deep_copy between strided subviews with more than one column
1128 // doesn't currently work.
1130 // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1131 // both source and target are constant stride and have multiple
1132 // columns.
1133 if (numSameIDs > 0) {
1134 const std::pair<size_t, size_t> rows(0, numSameIDs);
1135 if (copyOnHost) {
1136 ProfilingRegion regionC("Tpetra::MultiVector::copy[Host]");
1137 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1138 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1139
1140 for (size_t j = 0; j < numCols; ++j) {
1141 const size_t tgtCol = isConstantStride() ? j : whichVectors_[j];
1142 const size_t srcCol =
1143 sourceMV.isConstantStride() ? j : sourceMV.whichVectors_[j];
1145 auto tgt_j = Kokkos::subview(tgt_h, rows, tgtCol);
1146 auto src_j = Kokkos::subview(src_h, rows, srcCol);
1147 if (CM == ADD_ASSIGN) {
1148 // Sum src_j into tgt_j
1149 using range_t =
1150 Kokkos::RangePolicy<execution_space, size_t>;
1151 range_t rp(space, 0, numSameIDs);
1152 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1153 aaf(tgt_j, src_j);
1154 Kokkos::parallel_for(rp, aaf);
1155 } else {
1156 // Copy src_j into tgt_j
1157 // DEEP_COPY REVIEW - HOSTMIRROR-TO-HOSTMIRROR
1158 Kokkos::deep_copy(space, tgt_j, src_j);
1159 space.fence("Tpetra::MultiVector::copyAndPermute-1");
1160 }
1161 }
1162 } else { // copy on device
1163 ProfilingRegion regionC("Tpetra::MultiVector::copy[Device]");
1164 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1165 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1166
1167 for (size_t j = 0; j < numCols; ++j) {
1168 const size_t tgtCol = isConstantStride() ? j : whichVectors_[j];
1169 const size_t srcCol =
1170 sourceMV.isConstantStride() ? j : sourceMV.whichVectors_[j];
1171
1172 auto tgt_j = Kokkos::subview(tgt_d, rows, tgtCol);
1173 auto src_j = Kokkos::subview(src_d, rows, srcCol);
1174 if (CM == ADD_ASSIGN) {
1175 // Sum src_j into tgt_j
1176 using range_t =
1177 Kokkos::RangePolicy<execution_space, size_t>;
1178 range_t rp(space, 0, numSameIDs);
1179 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1180 aaf(tgt_j, src_j);
1181 Kokkos::parallel_for(rp, aaf);
1182 } else {
1183 // Copy src_j into tgt_j
1184 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
1185 Kokkos::deep_copy(space, tgt_j, src_j);
1186 space.fence("Tpetra::MultiVector::copyAndPermute-2");
1187 }
1188 }
1189 }
1190 }
1191
1192 // For the remaining GIDs, execute the permutations. This may
1193 // involve noncontiguous access of both source and destination
1194 // vectors, depending on the LID lists.
1195 //
1196 // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1197 // the same process, this merges their values by replacement of
1198 // the last encountered GID, not by the specified merge rule
1199 // (such as ADD).
1200
1201 // If there are no permutations, we are done
1202 if (permuteFromLIDs.extent(0) == 0 ||
1203 permuteToLIDs.extent(0) == 0) {
1204 if (verbose) {
1205 std::ostringstream os;
1206 os << *prefix << "No permutations. Done!" << endl;
1207 std::cerr << os.str();
1208 }
1209 return;
1210 }
1211 ProfilingRegion regionP("Tpetra::MultiVector::permute");
1212
1213 if (verbose) {
1214 std::ostringstream os;
1215 os << *prefix << "Permute" << endl;
1216 std::cerr << os.str();
1217 }
1218
1219 // We could in theory optimize for the case where exactly one of
1220 // them is constant stride, but we don't currently do that.
1221 const bool nonConstStride =
1222 !this->isConstantStride() || !sourceMV.isConstantStride();
1223
1224 if (verbose) {
1225 std::ostringstream os;
1226 os << *prefix << "nonConstStride="
1227 << (nonConstStride ? "true" : "false") << endl;
1228 std::cerr << os.str();
1229 }
1230
1231 // We only need the "which vectors" arrays if either the source or
1232 // target MV is not constant stride. Since we only have one
1233 // kernel that must do double-duty, we have to create a "which
1234 // vectors" array for the MV that _is_ constant stride.
1235 Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1236 Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1237 if (nonConstStride) {
1238 if (this->whichVectors_.size() == 0) {
1239 Kokkos::DualView<size_t*, device_type> tmpTgt("tgtWhichVecs", numCols);
1240 tmpTgt.modify_host();
1241 for (size_t j = 0; j < numCols; ++j) {
1242 tmpTgt.view_host()(j) = j;
1243 }
1244 if (!copyOnHost) {
1245 tmpTgt.sync_device();
1246 }
1248 } else {
1249 Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_();
1250 tgtWhichVecs =
1252 "tgtWhichVecs",
1253 copyOnHost);
1254 }
1255 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(tgtWhichVecs.extent(0)) !=
1256 this->getNumVectors(),
1257 std::logic_error, "tgtWhichVecs.extent(0) = " << tgtWhichVecs.extent(0) << " != this->getNumVectors() = " << this->getNumVectors() << ".");
1258
1259 if (sourceMV.whichVectors_.size() == 0) {
1260 Kokkos::DualView<size_t*, device_type> tmpSrc("srcWhichVecs", numCols);
1261 tmpSrc.modify_host();
1262 for (size_t j = 0; j < numCols; ++j) {
1263 tmpSrc.view_host()(j) = j;
1265 if (!copyOnHost) {
1266 tmpSrc.sync_device();
1267 }
1269 } else {
1270 Teuchos::ArrayView<const size_t> srcWhichVecsT =
1271 sourceMV.whichVectors_();
1274 "srcWhichVecs",
1275 copyOnHost);
1276 }
1277 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(srcWhichVecs.extent(0)) !=
1278 sourceMV.getNumVectors(),
1279 std::logic_error,
1280 "srcWhichVecs.extent(0) = " << srcWhichVecs.extent(0)
1281 << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors()
1282 << ".");
1283 }
1284
1285 if (copyOnHost) { // permute on host too
1286 if (verbose) {
1287 std::ostringstream os;
1288 os << *prefix << "Get permute LIDs on host" << std::endl;
1289 std::cerr << os.str();
1290 }
1291 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1292 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1293
1294 TEUCHOS_ASSERT(!permuteToLIDs.need_sync_host());
1296 TEUCHOS_ASSERT(!permuteFromLIDs.need_sync_host());
1297 auto permuteFromLIDs_h =
1299
1300 if (verbose) {
1301 std::ostringstream os;
1302 os << *prefix << "Permute on host" << endl;
1303 std::cerr << os.str();
1304 }
1305 if (nonConstStride) {
1306 // No need to sync first, because copyOnHost argument to
1307 // getDualViewCopyFromArrayView puts them in the right place.
1308 auto tgtWhichVecs_h =
1309 create_const_view(tgtWhichVecs.view_host());
1310 auto srcWhichVecs_h =
1311 create_const_view(srcWhichVecs.view_host());
1312 if (CM == ADD_ASSIGN) {
1313 using op_type = KokkosRefactor::Details::AddOp;
1314 permute_array_multi_column_variable_stride(tgt_h, src_h,
1318 srcWhichVecs_h, numCols,
1319 op_type());
1320 } else {
1321 using op_type = KokkosRefactor::Details::InsertOp;
1322 permute_array_multi_column_variable_stride(tgt_h, src_h,
1326 srcWhichVecs_h, numCols,
1327 op_type());
1328 }
1329 } else {
1330 if (CM == ADD_ASSIGN) {
1331 using op_type = KokkosRefactor::Details::AddOp;
1332 permute_array_multi_column(tgt_h, src_h, permuteToLIDs_h,
1333 permuteFromLIDs_h, numCols, op_type());
1334 } else {
1335 using op_type = KokkosRefactor::Details::InsertOp;
1336 permute_array_multi_column(tgt_h, src_h, permuteToLIDs_h,
1337 permuteFromLIDs_h, numCols, op_type());
1338 }
1339 }
1340 } else { // permute on device
1341 if (verbose) {
1342 std::ostringstream os;
1343 os << *prefix << "Get permute LIDs on device" << endl;
1344 std::cerr << os.str();
1345 }
1346 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1347 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1348
1349 TEUCHOS_ASSERT(!permuteToLIDs.need_sync_device());
1350 auto permuteToLIDs_d = create_const_view(permuteToLIDs.view_device());
1351 TEUCHOS_ASSERT(!permuteFromLIDs.need_sync_device());
1352 auto permuteFromLIDs_d =
1353 create_const_view(permuteFromLIDs.view_device());
1354
1355 if (verbose) {
1356 std::ostringstream os;
1357 os << *prefix << "Permute on device" << endl;
1358 std::cerr << os.str();
1359 }
1360 if (nonConstStride) {
1361 // No need to sync first, because copyOnHost argument to
1362 // getDualViewCopyFromArrayView puts them in the right place.
1364 auto srcWhichVecs_d = create_const_view(srcWhichVecs.view_device());
1365 if (CM == ADD_ASSIGN) {
1366 using op_type = KokkosRefactor::Details::AddOp;
1367 permute_array_multi_column_variable_stride(space, tgt_d, src_d,
1371 srcWhichVecs_d, numCols,
1372 op_type());
1373 } else {
1374 using op_type = KokkosRefactor::Details::InsertOp;
1375 permute_array_multi_column_variable_stride(space, tgt_d, src_d,
1379 srcWhichVecs_d, numCols,
1380 op_type());
1381 }
1382 } else {
1383 if (CM == ADD_ASSIGN) {
1384 using op_type = KokkosRefactor::Details::AddOp;
1385 permute_array_multi_column(space, tgt_d, src_d, permuteToLIDs_d,
1386 permuteFromLIDs_d, numCols, op_type());
1387 } else {
1388 using op_type = KokkosRefactor::Details::InsertOp;
1389 permute_array_multi_column(space, tgt_d, src_d, permuteToLIDs_d,
1390 permuteFromLIDs_d, numCols, op_type());
1391 }
1392 }
1394
1395 if (verbose) {
1396 std::ostringstream os;
1397 os << *prefix << "Done!" << endl;
1398 std::cerr << os.str();
1399 }
1401
1402template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1404 const SrcDistObject& sourceObj, const size_t numSameIDs,
1405 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1406 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1407 const CombineMode CM) {
1408 copyAndPermute(sourceObj, numSameIDs, permuteToLIDs, permuteFromLIDs, CM,
1409 execution_space());
1411
1412template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1415 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1416 Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1417 Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1418 size_t& constantNumPackets,
1419 const execution_space& space) {
1420 using Kokkos::Compat::create_const_view;
1421 using Kokkos::Compat::getKokkosViewDeepCopy;
1422 using std::endl;
1423 using ::Tpetra::Details::Behavior;
1424 using ::Tpetra::Details::ProfilingRegion;
1425 using ::Tpetra::Details::reallocDualViewIfNeeded;
1427
1428 // We've already called checkSizes(), so this cast must succeed.
1429 MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&>(sourceObj));
1430
1431 const bool packOnHost = runKernelOnHost(sourceMV);
1432 const char longFuncNameHost[] = "Tpetra::MultiVector::packAndPrepare[Host]";
1433 const char longFuncNameDevice[] = "Tpetra::MultiVector::packAndPrepare[Device]";
1434 const char tfecfFuncName[] = "packAndPrepare: ";
1436
1437 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1438 // have the option to check indices. We do so when Tpetra is in
1439 // debug mode. It is in debug mode by default in a debug build,
1440 // but you may control this at run time, before launching the
1441 // executable, by setting the TPETRA_DEBUG environment variable to
1442 // "1" (or "TRUE").
1443 const bool debugCheckIndices = Behavior::debug();
1444 // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1445 // environment variable to "1" (or "TRUE") for copious debug
1446 // output to std::cerr on every MPI process. This is unwise for
1447 // runs with large numbers of MPI processes.
1448 const bool printDebugOutput = Behavior::verbose();
1449
1450 std::unique_ptr<std::string> prefix;
1451 if (printDebugOutput) {
1452 auto map = this->getMap();
1453 auto comm = map.is_null() ? Teuchos::null : map->getComm();
1454 const int myRank = comm.is_null() ? -1 : comm->getRank();
1455 std::ostringstream os;
1456 os << "Proc " << myRank << ": MV::packAndPrepare: ";
1457 prefix = std::unique_ptr<std::string>(new std::string(os.str()));
1458 os << "Start" << endl;
1459 std::cerr << os.str();
1460 }
1461
1462 const size_t numCols = sourceMV.getNumVectors();
1463
1464 // This spares us from needing to fill numExportPacketsPerLID.
1465 // Setting constantNumPackets to a nonzero value signals that
1466 // all packets have the same number of entries.
1467 constantNumPackets = numCols;
1468
1469 // If we have no exports, there is nothing to do. Make sure this
1470 // goes _after_ setting constantNumPackets correctly.
1471 if (exportLIDs.extent(0) == 0) {
1472 if (printDebugOutput) {
1473 std::ostringstream os;
1474 os << *prefix << "No exports on this proc, DONE" << std::endl;
1475 std::cerr << os.str();
1476 }
1477 return;
1478 }
1479
1480 /* The layout in the export for MultiVectors is as follows:
1481 exports = { all of the data from row exportLIDs.front() ;
1482 ....
1483 all of the data from row exportLIDs.back() }
1484 This doesn't have the best locality, but is necessary because
1485 the data for a Packet (all data associated with an LID) is
1486 required to be contiguous. */
1487
1488 // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1489 // packing scheme in the above comment? The data going to a
1490 // particular process must be contiguous, of course, but those
1491 // data could include entries from multiple LIDs. DistObject just
1492 // needs to know how to index into that data. Kokkos is good at
1493 // decoupling storage intent from data layout choice.
1494
1495 const size_t numExportLIDs = exportLIDs.extent(0);
1496 const size_t newExportsSize = numCols * numExportLIDs;
1497 if (printDebugOutput) {
1498 std::ostringstream os;
1499 os << *prefix << "realloc: "
1500 << "numExportLIDs: " << numExportLIDs
1501 << ", exports.extent(0): " << exports.extent(0)
1502 << ", newExportsSize: " << newExportsSize << std::endl;
1503 std::cerr << os.str();
1504 }
1505 reallocDualViewIfNeeded(exports, newExportsSize, "exports");
1506
1507 // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1508 // sync it. Pack it where it's currently sync'd.
1509 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(sourceMV.need_sync_device() && sourceMV.need_sync_host(),
1510 std::logic_error,
1511 "Input MultiVector needs sync to both host "
1512 "and device.");
1513 if (printDebugOutput) {
1514 std::ostringstream os;
1515 os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1516 std::cerr << os.str();
1517 }
1518
1519 // Mark 'exports' here, since we might have resized it above.
1520 // Resizing currently requires calling the constructor, which
1521 // clears out the 'modified' flags.
1522 if (packOnHost) {
1523 // nde 06 Feb 2020: If 'exports' does not require resize
1524 // when reallocDualViewIfNeeded is called, the modified flags
1525 // are not cleared out. This can result in host and device views
1526 // being out-of-sync, resuling in an error in exports.modify_* calls.
1527 // Clearing the sync flags prevents this possible case.
1528 exports.clear_sync_state();
1529 exports.modify_host();
1530 } else {
1531 // nde 06 Feb 2020: If 'exports' does not require resize
1532 // when reallocDualViewIfNeeded is called, the modified flags
1533 // are not cleared out. This can result in host and device views
1534 // being out-of-sync, resuling in an error in exports.modify_* calls.
1535 // Clearing the sync flags prevents this possible case.
1536 exports.clear_sync_state();
1537 exports.modify_device();
1538 }
1539
1540 if (numCols == 1) { // special case for one column only
1541 // MultiVector always represents a single column with constant
1542 // stride, but it doesn't hurt to implement both cases anyway.
1543 //
1544 // ETP: I'm not sure I agree with the above statement. Can't a single-
1545 // column multivector be a subview of another multi-vector, in which case
1546 // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1547 // separately.
1548 //
1549 // mfh 18 Jan 2016: In answer to ETP's comment above:
1550 // MultiVector treats single-column MultiVectors created using a
1551 // "nonconstant stride constructor" as a special case, and makes
1552 // them constant stride (by making whichVectors_ have length 0).
1553 if (sourceMV.isConstantStride()) {
1554 using KokkosRefactor::Details::pack_array_single_column;
1555 if (printDebugOutput) {
1556 std::ostringstream os;
1557 os << *prefix << "Pack numCols=1 const stride" << endl;
1558 std::cerr << os.str();
1559 }
1560 if (packOnHost) {
1561 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1562 pack_array_single_column(exports.view_host(),
1563 src_host,
1564 exportLIDs.view_host(),
1565 0,
1567 } else { // pack on device
1568 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1569 pack_array_single_column(exports.view_device(),
1570 src_dev,
1571 exportLIDs.view_device(),
1572 0,
1574 space);
1575 }
1576 } else {
1577 using KokkosRefactor::Details::pack_array_single_column;
1578 if (printDebugOutput) {
1579 std::ostringstream os;
1580 os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1581 std::cerr << os.str();
1582 }
1583 if (packOnHost) {
1584 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1585 pack_array_single_column(exports.view_host(),
1586 src_host,
1587 exportLIDs.view_host(),
1588 sourceMV.whichVectors_[0],
1590 } else { // pack on device
1591 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1592 pack_array_single_column(exports.view_device(),
1593 src_dev,
1594 exportLIDs.view_device(),
1595 sourceMV.whichVectors_[0],
1597 space);
1598 }
1599 }
1600 } else { // the source MultiVector has multiple columns
1601 if (sourceMV.isConstantStride()) {
1602 using KokkosRefactor::Details::pack_array_multi_column;
1603 if (printDebugOutput) {
1604 std::ostringstream os;
1605 os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1606 std::cerr << os.str();
1607 }
1608 if (packOnHost) {
1609 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1610 pack_array_multi_column(exports.view_host(),
1611 src_host,
1612 exportLIDs.view_host(),
1613 numCols,
1615 } else { // pack on device
1616 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1617 pack_array_multi_column(exports.view_device(),
1618 src_dev,
1619 exportLIDs.view_device(),
1620 numCols,
1622 space);
1623 }
1624 } else {
1625 using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1626 if (printDebugOutput) {
1627 std::ostringstream os;
1628 os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1629 << endl;
1630 std::cerr << os.str();
1631 }
1632 // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1633 // whichVectors_ can be expensive, but pack and unpack for
1634 // nonconstant-stride MultiVectors is slower anyway.
1635 using IST = impl_scalar_type;
1636 using DV = Kokkos::DualView<IST*, device_type>;
1637 using HES = typename DV::t_host::execution_space;
1638 using DES = typename DV::t_dev::execution_space;
1639 Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_();
1640 if (packOnHost) {
1641 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1642 pack_array_multi_column_variable_stride(exports.view_host(),
1643 src_host,
1644 exportLIDs.view_host(),
1646 numCols,
1648 } else { // pack on device
1649 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1650 pack_array_multi_column_variable_stride(exports.view_device(),
1651 src_dev,
1652 exportLIDs.view_device(),
1654 numCols,
1655 debugCheckIndices, space);
1656 }
1657 }
1658 }
1660 if (printDebugOutput) {
1661 std::ostringstream os;
1662 os << *prefix << "Done!" << endl;
1663 std::cerr << os.str();
1664 }
1665}
1666
1667template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1670 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1671 Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1672 Kokkos::DualView<size_t*, buffer_device_type> numExportPacketsPerLID,
1673 size_t& constantNumPackets) {
1675}
1676
1677template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1678template <class NO>
1679typename std::enable_if<std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1680 typename NO::device_type::memory_space>::value,
1681 bool>::type
1684 const bool verbose,
1685 const std::string* prefix,
1686 const bool areRemoteLIDsContiguous,
1687 const CombineMode CM) {
1688 // This implementation of reallocImportsIfNeeded is an
1689 // optimization that is specific to MultiVector. We check if the
1690 // imports_ view can be aliased to the end of the data view_. If
1691 // that is the case, we can skip the unpackAndCombine call.
1692
1693 if (verbose) {
1694 std::ostringstream os;
1695 os << *prefix << "Realloc (if needed) imports_ from "
1696 << this->imports_.extent(0) << " to " << newSize << std::endl;
1697 std::cerr << os.str();
1698 }
1699
1700 bool reallocated = false;
1701
1702 using IST = impl_scalar_type;
1703 using DV = Kokkos::DualView<IST*, Kokkos::LayoutLeft, buffer_device_type>;
1704
1705 // Conditions for aliasing memory:
1706 // - When both sides of the dual view are in the same memory
1707 // space, we do not need to worry about syncing things.
1708 // - When both memory spaces are different, we only alias if this
1709 // does not incur additional sync'ing.
1710 // - The remote LIDs need to be contiguous, so that we do not need
1711 // to reorder received information.
1712 // - CombineMode needs to be INSERT.
1713 // - The number of vectors needs to be 1, otherwise we need to
1714 // reorder the received data.
1715 if ((std::is_same_v<typename dual_view_type::t_dev::device_type, typename dual_view_type::t_host::device_type> ||
1716 (Details::Behavior::assumeMpiIsGPUAware() && !this->need_sync_device()) ||
1717 (!Details::Behavior::assumeMpiIsGPUAware() && !this->need_sync_host())) &&
1718 areRemoteLIDsContiguous &&
1719 (CM == INSERT || CM == REPLACE) &&
1720 (getNumVectors() == 1) &&
1721 (newSize > 0)) {
1722 size_t offset = getLocalLength() - newSize;
1723 reallocated = this->imports_.view_device().data() != view_.getDualView().view_device().data() + offset;
1724 if (reallocated) {
1725 typedef std::pair<size_t, size_t> range_type;
1726 this->imports_ = DV(view_.getDualView(),
1727 range_type(offset, getLocalLength()),
1728 0);
1729
1730 if (verbose) {
1731 std::ostringstream os;
1732 os << *prefix << "Aliased imports_ to MV.view_" << std::endl;
1733 std::cerr << os.str();
1734 }
1735 }
1736 return reallocated;
1737 }
1738 {
1739 using ::Tpetra::Details::reallocDualViewIfNeeded;
1740 reallocated =
1741 reallocDualViewIfNeeded(this->unaliased_imports_, newSize, "imports");
1742 if (verbose) {
1743 std::ostringstream os;
1744 os << *prefix << "Finished realloc'ing unaliased_imports_" << std::endl;
1745 std::cerr << os.str();
1746 }
1747 this->imports_ = this->unaliased_imports_;
1748 }
1749 return reallocated;
1750}
1751
1752template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1753template <class NO>
1754typename std::enable_if<!std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1755 typename NO::device_type::memory_space>::value,
1756 bool>::type
1759 const bool verbose,
1760 const std::string* prefix,
1761 const bool areRemoteLIDsContiguous,
1762 const CombineMode CM) {
1764}
1765
1766template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1768 reallocImportsIfNeeded(const size_t newSize,
1769 const bool verbose,
1770 const std::string* prefix,
1771 const bool areRemoteLIDsContiguous,
1772 const CombineMode CM) {
1774 return reallocImportsIfNeededImpl<Node>(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1775}
1776
1777template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1780 return (this->imports_.view_device().data() + this->imports_.view_device().extent(0) ==
1781 view_.getDualView().view_device().data() + view_.getDualView().view_device().extent(0));
1782}
1783
1784template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1786 unpackAndCombine(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1787 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1788 Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1789 const size_t constantNumPackets,
1790 const CombineMode CM,
1791 const execution_space& space) {
1792 using Kokkos::Compat::getKokkosViewDeepCopy;
1793 using KokkosRefactor::Details::unpack_array_multi_column;
1794 using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1795 using std::endl;
1796 using ::Tpetra::Details::Behavior;
1797 using ::Tpetra::Details::ProfilingRegion;
1798
1799 const bool unpackOnHost = runKernelOnHost(imports);
1800
1801 const char longFuncNameHost[] = "Tpetra::MultiVector::unpackAndCombine[Host]";
1802 const char longFuncNameDevice[] = "Tpetra::MultiVector::unpackAndCombine[Device]";
1804 const char tfecfFuncName[] = "unpackAndCombine: ";
1805
1806 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1807 // have the option to check indices. We do so when Tpetra is in
1808 // debug mode. It is in debug mode by default in a debug build,
1809 // but you may control this at run time, before launching the
1810 // executable, by setting the TPETRA_DEBUG environment variable to
1811 // "1" (or "TRUE").
1812 const bool debugCheckIndices = Behavior::debug();
1813
1814 const bool printDebugOutput = Behavior::verbose();
1815 std::unique_ptr<std::string> prefix;
1816 if (printDebugOutput) {
1817 auto map = this->getMap();
1818 auto comm = map.is_null() ? Teuchos::null : map->getComm();
1819 const int myRank = comm.is_null() ? -1 : comm->getRank();
1820 std::ostringstream os;
1821 os << "Proc " << myRank << ": " << longFuncName << ": ";
1822 prefix = std::unique_ptr<std::string>(new std::string(os.str()));
1823 os << "Start" << endl;
1824 std::cerr << os.str();
1825 }
1826
1827 // If we have no imports, there is nothing to do
1828 if (importLIDs.extent(0) == 0) {
1829 if (printDebugOutput) {
1830 std::ostringstream os;
1831 os << *prefix << "No imports. Done!" << endl;
1832 std::cerr << os.str();
1833 }
1834 return;
1835 }
1836
1837 // Check, whether imports_ is a subview of the MV view.
1838 if (importsAreAliased()) {
1839 if (printDebugOutput) {
1840 std::ostringstream os;
1841 os << *prefix << "Skipping unpack, since imports_ is aliased to MV.view_. Done!" << endl;
1842 std::cerr << os.str();
1843 }
1844 return;
1845 }
1846
1847 ProfilingRegion regionUAC(longFuncName);
1848
1849 const size_t numVecs = getNumVectors();
1850 if (debugCheckIndices) {
1851 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(imports.extent(0)) !=
1852 numVecs * importLIDs.extent(0),
1853 std::runtime_error,
1854 "imports.extent(0) = " << imports.extent(0)
1855 << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1856 << " * " << importLIDs.extent(0) << " = "
1857 << numVecs * importLIDs.extent(0) << ".");
1858
1859 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(constantNumPackets == static_cast<size_t>(0), std::runtime_error,
1860 "constantNumPackets input argument must be nonzero.");
1861
1862 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(numVecs) !=
1863 static_cast<size_t>(constantNumPackets),
1864 std::runtime_error, "constantNumPackets must equal numVecs.");
1865 }
1866
1867 // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1868 // the memory space in which the imports buffer was last modified and
1869 // the size of the imports buffer.
1870 // DistObject::doTransferNew decides where it was last modified (based on
1871 // whether communication buffers used were on host or device).
1872 if (unpackOnHost) {
1873 if (this->imports_.need_sync_host()) this->imports_.sync_host();
1874 } else {
1875 if (this->imports_.need_sync_device()) this->imports_.sync_device();
1876 }
1877
1878 if (printDebugOutput) {
1879 std::ostringstream os;
1880 os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1881 << endl;
1882 std::cerr << os.str();
1883 }
1885 // We have to sync before modifying, because this method may read
1886 // as well as write (depending on the CombineMode).
1887 auto imports_d = imports.view_device();
1888 auto imports_h = imports.view_host();
1889 auto importLIDs_d = importLIDs.view_device();
1890 auto importLIDs_h = importLIDs.view_host();
1891
1892 Kokkos::DualView<size_t*, device_type> whichVecs;
1893 if (!isConstantStride()) {
1894 Kokkos::View<const size_t*, Kokkos::HostSpace,
1895 Kokkos::MemoryUnmanaged>
1896 whichVecsIn(whichVectors_.getRawPtr(),
1897 numVecs);
1898 whichVecs = Kokkos::DualView<size_t*, device_type>("whichVecs", numVecs);
1899 if (unpackOnHost) {
1900 whichVecs.modify_host();
1901 // DEEP_COPY REVIEW - NOT TESTED FOR CUDA BUILD
1902 Kokkos::deep_copy(whichVecs.view_host(), whichVecsIn);
1903 } else {
1904 whichVecs.modify_device();
1905 // DEEP_COPY REVIEW - HOST-TO-DEVICE
1906 Kokkos::deep_copy(whichVecs.view_device(), whichVecsIn);
1907 }
1908 }
1909 auto whichVecs_d = whichVecs.view_device();
1910 auto whichVecs_h = whichVecs.view_host();
1911
1912 /* The layout in the export for MultiVectors is as follows:
1913 imports = { all of the data from row exportLIDs.front() ;
1914 ....
1915 all of the data from row exportLIDs.back() }
1916 This doesn't have the best locality, but is necessary because
1917 the data for a Packet (all data associated with an LID) is
1918 required to be contiguous. */
1919
1920 if (numVecs > 0 && importLIDs.extent(0) > 0) {
1921 using dev_exec_space = typename dual_view_type::t_dev::execution_space;
1922 using host_exec_space = typename dual_view_type::t_host::execution_space;
1923
1924 // This fixes GitHub Issue #4418.
1925 const bool use_atomic_updates = unpackOnHost ? host_exec_space().concurrency() != 1 : dev_exec_space().concurrency() != 1;
1926
1927 if (printDebugOutput) {
1928 std::ostringstream os;
1929 os << *prefix << "Unpack: " << combineModeToString(CM) << endl;
1930 std::cerr << os.str();
1931 }
1932
1933 // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1934 // custom combine modes, start editing here.
1935
1936 if (CM == INSERT || CM == REPLACE) {
1937 using op_type = KokkosRefactor::Details::InsertOp;
1938 if (isConstantStride()) {
1939 if (unpackOnHost) {
1940 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1941 unpack_array_multi_column(host_exec_space(),
1943 op_type(), numVecs,
1946
1947 } else { // unpack on device
1948 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1949 unpack_array_multi_column(space,
1951 op_type(), numVecs,
1954 }
1955 } else { // not constant stride
1956 if (unpackOnHost) {
1957 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1958 unpack_array_multi_column_variable_stride(host_exec_space(),
1959 X_h, imports_h,
1962 op_type(),
1963 numVecs,
1966 } else { // unpack on device
1967 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1968 unpack_array_multi_column_variable_stride(space,
1969 X_d, imports_d,
1972 op_type(),
1973 numVecs,
1976 }
1977 }
1978 } else if (CM == ADD || CM == ADD_ASSIGN) {
1979 using op_type = KokkosRefactor::Details::AddOp;
1980 if (isConstantStride()) {
1981 if (unpackOnHost) {
1982 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1983 unpack_array_multi_column(host_exec_space(),
1985 op_type(), numVecs,
1988 } else { // unpack on device
1989 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1990 unpack_array_multi_column(space,
1992 op_type(), numVecs,
1995 }
1996 } else { // not constant stride
1997 if (unpackOnHost) {
1998 auto X_h = this->getLocalViewHost(Access::ReadWrite);
1999 unpack_array_multi_column_variable_stride(host_exec_space(),
2000 X_h, imports_h,
2003 op_type(),
2004 numVecs,
2007 } else { // unpack on device
2008 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2009 unpack_array_multi_column_variable_stride(space,
2010 X_d, imports_d,
2013 op_type(),
2014 numVecs,
2017 }
2018 }
2019 } else if (CM == ABSMAX) {
2020 using op_type = KokkosRefactor::Details::AbsMaxOp;
2021 if (isConstantStride()) {
2022 if (unpackOnHost) {
2023 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2024 unpack_array_multi_column(host_exec_space(),
2026 op_type(), numVecs,
2029 } else { // unpack on device
2030 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2031 unpack_array_multi_column(space,
2033 op_type(), numVecs,
2036 }
2037 } else {
2039 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2040 unpack_array_multi_column_variable_stride(host_exec_space(),
2041 X_h, imports_h,
2044 op_type(),
2045 numVecs,
2048 } else { // unpack on device
2049 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2050 unpack_array_multi_column_variable_stride(space,
2051 X_d, imports_d,
2054 op_type(),
2055 numVecs,
2058 }
2059 }
2060 } else {
2061 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, "Invalid CombineMode");
2062 }
2063 } else {
2064 if (printDebugOutput) {
2065 std::ostringstream os;
2066 os << *prefix << "Nothing to unpack" << endl;
2067 std::cerr << os.str();
2068 }
2069 }
2070
2071 if (printDebugOutput) {
2072 std::ostringstream os;
2073 os << *prefix << "Done!" << endl;
2074 std::cerr << os.str();
2075 }
2076}
2077
2078template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2080 unpackAndCombine(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
2081 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
2082 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
2083 const size_t constantNumPackets,
2084 const CombineMode CM) {
2085 unpackAndCombine(importLIDs, imports, numPacketsPerLID, constantNumPackets, CM, execution_space());
2086}
2088template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2089size_t
2091 getNumVectors() const {
2092 if (isConstantStride()) {
2093 return static_cast<size_t>(view_.extent(1));
2094 } else {
2095 return static_cast<size_t>(whichVectors_.size());
2096 }
2097}
2098
2099namespace { // (anonymous)
2100
2101template <class RV>
2102void gblDotImpl(const RV& dotsOut,
2103 const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2104 const bool distributed) {
2105 using Teuchos::REDUCE_MAX;
2106 using Teuchos::REDUCE_SUM;
2107 using Teuchos::reduceAll;
2108 typedef typename RV::non_const_value_type dot_type;
2109
2110 const size_t numVecs = dotsOut.extent(0);
2111
2112 // If the MultiVector is distributed over multiple processes, do
2113 // the distributed (interprocess) part of the dot product. We
2114 // assume that the MPI implementation can read from and write to
2115 // device memory.
2116 //
2117 // replaceMap() may have removed some processes. Those
2118 // processes have a null Map. They must not participate in any
2119 // collective operations. We ask first whether the Map is null,
2120 // because isDistributed() defers that question to the Map. We
2121 // still compute and return local dots for processes not
2122 // participating in collective operations; those probably don't
2123 // make any sense, but it doesn't hurt to do them, since it's
2124 // illegal to call dot() on those processes anyway.
2125 if (distributed && !comm.is_null()) {
2126 // The calling process only participates in the collective if
2127 // both the Map and its Comm on that process are nonnull.
2128 const int nv = static_cast<int>(numVecs);
2130
2131 if (commIsInterComm) {
2132 // If comm is an intercomm, then we may not alias input and
2133 // output buffers, so we have to make a copy of the local
2134 // sum.
2135 typename RV::non_const_type lclDots(Kokkos::ViewAllocateWithoutInitializing("tmp"), numVecs);
2136 // DEEP_COPY REVIEW - NOT TESTED
2137 Kokkos::deep_copy(lclDots, dotsOut);
2138 const dot_type* const lclSum = lclDots.data();
2139 dot_type* const gblSum = dotsOut.data();
2141 } else {
2142 dot_type* const inout = dotsOut.data();
2144 }
2145 }
2146}
2147} // namespace
2148
2149template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2152 const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const {
2153 using Kokkos::subview;
2154 using Teuchos::Comm;
2155 using Teuchos::null;
2156 using Teuchos::RCP;
2157 using ::Tpetra::Details::Behavior;
2158 // View of all the dot product results.
2159 typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
2160 typedef typename dual_view_type::t_dev_const XMV;
2161 const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
2162
2163 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::dot (Kokkos::View)");
2164
2165 const size_t numVecs = this->getNumVectors();
2166 if (numVecs == 0) {
2167 return; // nothing to do
2168 }
2169 const size_t lclNumRows = this->getLocalLength();
2170 const size_t numDots = static_cast<size_t>(dots.extent(0));
2171 const bool debug = Behavior::debug();
2172
2173 if (debug) {
2174 const bool compat = this->getMap()->isCompatible(*(A.getMap()));
2175 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!compat, std::invalid_argument,
2176 "'*this' MultiVector is not "
2177 "compatible with the input MultiVector A. We only test for this "
2178 "in debug mode.");
2179 }
2180
2181 // FIXME (mfh 11 Jul 2014) These exception tests may not
2182 // necessarily be thrown on all processes consistently. We should
2183 // instead pass along error state with the inner product. We
2184 // could do this by setting an extra slot to
2185 // Kokkos::ArithTraits<dot_type>::one() on error. The
2186 // final sum should be
2187 // Kokkos::ArithTraits<dot_type>::zero() if not error.
2189 lclNumRows != A.getLocalLength(), std::runtime_error,
2190 "MultiVectors do not have the same local length. "
2191 "this->getLocalLength() = "
2192 << lclNumRows << " != "
2193 "A.getLocalLength() = "
2194 << A.getLocalLength() << ".");
2196 numVecs != A.getNumVectors(), std::runtime_error,
2197 "MultiVectors must have the same number of columns (vectors). "
2198 "this->getNumVectors() = "
2199 << numVecs << " != "
2200 "A.getNumVectors() = "
2201 << A.getNumVectors() << ".");
2203 numDots != numVecs, std::runtime_error,
2204 "The output array 'dots' must have the same number of entries as the "
2205 "number of columns (vectors) in *this and A. dots.extent(0) = "
2206 << numDots << " != this->getNumVectors() = " << numVecs << ".");
2207
2208 const std::pair<size_t, size_t> colRng(0, numVecs);
2210 RCP<const Comm<int> > comm = this->getMap().is_null() ? null : this->getMap()->getComm();
2211
2212 auto thisView = this->getLocalViewDevice(Access::ReadOnly);
2213 auto A_view = A.getLocalViewDevice(Access::ReadOnly);
2214
2215 ::Tpetra::Details::lclDot<RV, XMV>(dotsOut, thisView, A_view, lclNumRows, numVecs,
2216 this->whichVectors_.getRawPtr(),
2217 A.whichVectors_.getRawPtr(),
2218 this->isConstantStride(), A.isConstantStride());
2220 // lbv 15 mar 2023: Kokkos Kernels provides non-blocking BLAS
2221 // functions unless they explicitely return a value to Host.
2222 // Here while the lclDot are on host, they are not a return
2223 // value, therefore they might be avaible to us immediately.
2224 // Adding a frnce here guarantees that we will have the lclDot
2225 // ahead of the MPI reduction.
2227 exec_space_instance.fence("Tpetra::MultiVector::dot");
2229 gblDotImpl(dotsOut, comm, this->isDistributed());
2230}
2231
2232namespace { // (anonymous)
2233template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2237 using ::Tpetra::Details::ProfilingRegion;
2239 using dot_type = typename MV::dot_type;
2240 ProfilingRegion region("Tpetra::multiVectorSingleColumnDot");
2241
2242 auto map = x.getMap();
2243 Teuchos::RCP<const Teuchos::Comm<int> > comm =
2244 map.is_null() ? Teuchos::null : map->getComm();
2245 if (comm.is_null()) {
2246#if KOKKOS_VERSION >= 40799
2247 return KokkosKernels::ArithTraits<dot_type>::zero();
2248#else
2249 return Kokkos::ArithTraits<dot_type>::zero();
2250#endif
2251 } else {
2252 using LO = LocalOrdinal;
2253 // The min just ensures that we don't overwrite memory that
2254 // doesn't belong to us, in the erroneous input case where x
2255 // and y have different numbers of rows.
2256 const LO lclNumRows = static_cast<LO>(std::min(x.getLocalLength(),
2257 y.getLocalLength()));
2258 const Kokkos::pair<LO, LO> rowRng(0, lclNumRows);
2259#if KOKKOS_VERSION >= 40799
2260 dot_type lclDot = KokkosKernels::ArithTraits<dot_type>::zero();
2261#else
2262 dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero();
2263#endif
2264#if KOKKOS_VERSION >= 40799
2265 dot_type gblDot = KokkosKernels::ArithTraits<dot_type>::zero();
2266#else
2267 dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero();
2268#endif
2269
2270 // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2271 // const_cast<MV&>(x).sync_device ();
2272 // const_cast<MV&>(y).sync_device ();
2273
2274 auto x_2d = x.getLocalViewDevice(Access::ReadOnly);
2275 auto x_1d = Kokkos::subview(x_2d, rowRng, 0);
2276 auto y_2d = y.getLocalViewDevice(Access::ReadOnly);
2277 auto y_1d = Kokkos::subview(y_2d, rowRng, 0);
2278 lclDot = KokkosBlas::dot(x_1d, y_1d);
2279
2280 if (x.isDistributed()) {
2281 using Teuchos::outArg;
2282 using Teuchos::REDUCE_SUM;
2283 using Teuchos::reduceAll;
2285 } else {
2286 gblDot = lclDot;
2287 }
2288 return gblDot;
2289 }
2291} // namespace
2292
2293template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2296 const Teuchos::ArrayView<dot_type>& dots) const {
2297 const char tfecfFuncName[] = "dot: ";
2298 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::dot (Teuchos::ArrayView)");
2299
2300 const size_t numVecs = this->getNumVectors();
2301 const size_t lclNumRows = this->getLocalLength();
2302 const size_t numDots = static_cast<size_t>(dots.size());
2303
2304 // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2305 // not necessarily be thrown on all processes consistently. We
2306 // keep them for now, because MultiVector's unit tests insist on
2307 // them. In the future, we should instead pass along error state
2308 // with the inner product. We could do this by setting an extra
2309 // slot to Kokkos::ArithTraits<dot_type>::one() on error.
2310 // The final sum should be
2311 // Kokkos::ArithTraits<dot_type>::zero() if not error.
2312 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclNumRows != A.getLocalLength(), std::runtime_error,
2313 "MultiVectors do not have the same local length. "
2314 "this->getLocalLength() = "
2315 << lclNumRows << " != "
2316 "A.getLocalLength() = "
2317 << A.getLocalLength() << ".");
2318 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numVecs != A.getNumVectors(), std::runtime_error,
2319 "MultiVectors must have the same number of columns (vectors). "
2320 "this->getNumVectors() = "
2321 << numVecs << " != "
2322 "A.getNumVectors() = "
2323 << A.getNumVectors() << ".");
2325 "The output array 'dots' must have the same number of entries as the "
2326 "number of columns (vectors) in *this and A. dots.extent(0) = "
2327 << numDots << " != this->getNumVectors() = " << numVecs << ".");
2328
2329 if (numVecs == 1 && this->isConstantStride() && A.isConstantStride()) {
2331 dots[0] = gblDot;
2332 } else {
2333 this->dot(A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr(), numDots));
2334 }
2335}
2336
2337template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2339 norm2(const Teuchos::ArrayView<mag_type>& norms) const {
2341 using ::Tpetra::Details::NORM_TWO;
2342 using ::Tpetra::Details::ProfilingRegion;
2343 ProfilingRegion region("Tpetra::MV::norm2 (host output)");
2344
2345 // The function needs to be able to sync X.
2346 MV& X = const_cast<MV&>(*this);
2347 multiVectorNormImpl(norms.getRawPtr(), X, NORM_TWO);
2348}
2349
2350template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2352 norm2(const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const {
2353 Teuchos::ArrayView<mag_type> norms_av(norms.data(), norms.extent(0));
2354 this->norm2(norms_av);
2355}
2356
2357template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2359 norm1(const Teuchos::ArrayView<mag_type>& norms) const {
2361 using ::Tpetra::Details::NORM_ONE;
2362 using ::Tpetra::Details::ProfilingRegion;
2363 ProfilingRegion region("Tpetra::MV::norm1 (host output)");
2364
2365 // The function needs to be able to sync X.
2366 MV& X = const_cast<MV&>(*this);
2367 multiVectorNormImpl(norms.data(), X, NORM_ONE);
2368}
2369
2370template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2372 norm1(const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const {
2373 Teuchos::ArrayView<mag_type> norms_av(norms.data(), norms.extent(0));
2374 this->norm1(norms_av);
2375}
2376
2377template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2379 normInf(const Teuchos::ArrayView<mag_type>& norms) const {
2381 using ::Tpetra::Details::NORM_INF;
2382 using ::Tpetra::Details::ProfilingRegion;
2383 ProfilingRegion region("Tpetra::MV::normInf (host output)");
2384
2385 // The function needs to be able to sync X.
2386 MV& X = const_cast<MV&>(*this);
2387 multiVectorNormImpl(norms.getRawPtr(), X, NORM_INF);
2388}
2389
2390template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2392 normInf(const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const {
2393 Teuchos::ArrayView<mag_type> norms_av(norms.data(), norms.extent(0));
2394 this->normInf(norms_av);
2395}
2396
2397template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2399 meanValue(const Teuchos::ArrayView<impl_scalar_type>& means) const {
2400 // KR FIXME Overload this method to take a View.
2401 using Kokkos::ALL;
2402 using Kokkos::subview;
2403 using Teuchos::Comm;
2404 using Teuchos::RCP;
2405 using Teuchos::REDUCE_SUM;
2406 using Teuchos::reduceAll;
2407#if KOKKOS_VERSION >= 40799
2408 typedef KokkosKernels::ArithTraits<impl_scalar_type> ATS;
2409#else
2410 typedef Kokkos::ArithTraits<impl_scalar_type> ATS;
2411#endif
2412
2413 const size_t lclNumRows = this->getLocalLength();
2414 const size_t numVecs = this->getNumVectors();
2415 const size_t numMeans = static_cast<size_t>(means.size());
2416
2418 numMeans != numVecs, std::runtime_error,
2419 "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2420 << " != this->getNumVectors() = " << numVecs << ".");
2421
2422 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2423 const std::pair<size_t, size_t> colRng(0, numVecs);
2424
2425 // Make sure that the final output view has the same layout as the
2426 // temporary view's host_mirror_type. Left or Right doesn't matter for
2427 // a 1-D array anyway; this is just to placate the compiler.
2428 typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2429 typedef Kokkos::View<impl_scalar_type*,
2430 typename local_view_type::host_mirror_type::array_layout,
2431 Kokkos::HostSpace,
2432 Kokkos::MemoryTraits<Kokkos::Unmanaged> >
2435
2436 RCP<const Comm<int> > comm = this->getMap().is_null() ? Teuchos::null : this->getMap()->getComm();
2437
2438 // If we need sync to device, then host has the most recent version.
2439 const bool useHostVersion = this->need_sync_device();
2440 if (useHostVersion) {
2441 // DualView was last modified on host, so run the local kernel there.
2442 auto X_lcl = subview(getLocalViewHost(Access::ReadOnly),
2443 rowRng, Kokkos::ALL());
2444 // Compute the local sum of each column.
2445 Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums("MV::meanValue tmp", numVecs);
2446 if (isConstantStride()) {
2447 KokkosBlas::sum(lclSums, X_lcl);
2448 } else {
2449 for (size_t j = 0; j < numVecs; ++j) {
2450 const size_t col = whichVectors_[j];
2451 KokkosBlas::sum(subview(lclSums, j), subview(X_lcl, ALL(), col));
2452 }
2453 }
2454
2455 // If there are multiple MPI processes, the all-reduce reads
2456 // from lclSums, and writes to meansOut. Otherwise, we just
2457 // copy lclSums into meansOut.
2458 if (!comm.is_null() && this->isDistributed()) {
2459 reduceAll(*comm, REDUCE_SUM, static_cast<int>(numVecs),
2460 lclSums.data(), meansOut.data());
2461 } else {
2462 // DEEP_COPY REVIEW - NOT TESTED
2463 Kokkos::deep_copy(meansOut, lclSums);
2464 }
2465 } else {
2466 // DualView was last modified on device, so run the local kernel there.
2467 auto X_lcl = subview(this->getLocalViewDevice(Access::ReadOnly),
2468 rowRng, Kokkos::ALL());
2469
2470 // Compute the local sum of each column.
2471 Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums("MV::meanValue tmp", numVecs);
2472 if (isConstantStride()) {
2473 KokkosBlas::sum(lclSums, X_lcl);
2474 } else {
2475 for (size_t j = 0; j < numVecs; ++j) {
2476 const size_t col = whichVectors_[j];
2477 KokkosBlas::sum(subview(lclSums, j), subview(X_lcl, ALL(), col));
2478 }
2479 }
2480 // lbv 10 mar 2023: Kokkos Kernels provides non-blocking BLAS
2481 // functions unless they explicitly return a value to Host.
2482 // Here while the lclSums are on the host, they are not a return
2483 // value, therefore they might be available to us immediately.
2484 // Adding a fence here guarantees that we will have the lclSums
2485 // ahead of the MPI reduction.
2487 exec_space_instance.fence("Tpetra::MultiVector::mean");
2488
2489 // If there are multiple MPI processes, the all-reduce reads
2490 // from lclSums, and writes to meansOut. (We assume that MPI
2491 // can read device memory.) Otherwise, we just copy lclSums
2492 // into meansOut.
2493 if (!comm.is_null() && this->isDistributed()) {
2494 reduceAll(*comm, REDUCE_SUM, static_cast<int>(numVecs),
2495 lclSums.data(), meansOut.data());
2496 } else {
2497 // DEEP_COPY REVIEW - HOST-TO-HOST - NOT TESTED FOR MPI BUILD
2498 Kokkos::deep_copy(meansOut, lclSums);
2499 }
2500 }
2501
2502 // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2503 // to the magnitude type, since operator/ (std::complex<T>, int)
2504 // isn't necessarily defined.
2506 ATS::one() / static_cast<mag_type>(this->getGlobalLength());
2507 for (size_t k = 0; k < numMeans; ++k) {
2509 }
2510}
2511
2512template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2514 randomize() {
2515 typedef impl_scalar_type IST;
2516#if KOKKOS_VERSION >= 40799
2517 typedef KokkosKernels::ArithTraits<IST> ATS;
2518#else
2519 typedef Kokkos::ArithTraits<IST> ATS;
2520#endif
2521 typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2522 typedef typename pool_type::generator_type generator_type;
2523
2524 const IST max = Kokkos::rand<generator_type, IST>::max();
2525 const IST min = ATS::is_signed ? IST(-max) : ATS::zero();
2526
2527 this->randomize(static_cast<Scalar>(min), static_cast<Scalar>(max));
2528}
2529
2530template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2532 randomize(const Scalar& minVal, const Scalar& maxVal) {
2533 typedef impl_scalar_type IST;
2534 typedef Tpetra::Details::Static_Random_XorShift64_Pool<typename device_type::execution_space> tpetra_pool_type;
2535 typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2536
2537 // Seed the pool based on the system RNG and the MPI rank, if needed
2538 if (!tpetra_pool_type::isSet())
2539 tpetra_pool_type::resetPool(this->getMap()->getComm()->getRank());
2540
2541 pool_type& rand_pool = tpetra_pool_type::getPool();
2542 const IST max = static_cast<IST>(maxVal);
2543 const IST min = static_cast<IST>(minVal);
2544
2545 auto thisView = this->getLocalViewDevice(Access::OverwriteAll);
2546
2547 if (isConstantStride()) {
2548 Kokkos::fill_random(thisView, rand_pool, min, max);
2549 } else {
2550 const size_t numVecs = getNumVectors();
2551 for (size_t k = 0; k < numVecs; ++k) {
2552 const size_t col = whichVectors_[k];
2553 auto X_k = Kokkos::subview(thisView, Kokkos::ALL(), col);
2554 Kokkos::fill_random(X_k, rand_pool, min, max);
2555 }
2556 }
2557}
2558
2559template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2561 putScalar(const Scalar& alpha) {
2562 using ::Tpetra::Details::ProfilingRegion;
2563 using ::Tpetra::Details::Blas::fill;
2564 using DES = typename dual_view_type::t_dev::execution_space;
2565 using HES = typename dual_view_type::t_host::execution_space;
2566 using LO = LocalOrdinal;
2567 ProfilingRegion region("Tpetra::MultiVector::putScalar");
2568
2569 // We need this cast for cases like Scalar = std::complex<T> but
2570 // impl_scalar_type = Kokkos::complex<T>.
2571 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
2572 const LO lclNumRows = static_cast<LO>(this->getLocalLength());
2573 const LO numVecs = static_cast<LO>(this->getNumVectors());
2574
2575 // Modify the most recently updated version of the data. This
2576 // avoids sync'ing, which could violate users' expectations.
2577 //
2578 // If we need sync to device, then host has the most recent version.
2579 const bool runOnHost = runKernelOnHost(*this);
2580 if (!runOnHost) {
2581 auto X = this->getLocalViewDevice(Access::OverwriteAll);
2582 if (this->isConstantStride()) {
2583 fill(DES(), X, theAlpha, lclNumRows, numVecs);
2584 } else {
2585 fill(DES(), X, theAlpha, lclNumRows, numVecs,
2586 this->whichVectors_.getRawPtr());
2587 }
2588 } else { // last modified in host memory, so modify data there.
2589 auto X = this->getLocalViewHost(Access::OverwriteAll);
2590 if (this->isConstantStride()) {
2591 fill(HES(), X, theAlpha, lclNumRows, numVecs);
2592 } else {
2593 fill(HES(), X, theAlpha, lclNumRows, numVecs,
2594 this->whichVectors_.getRawPtr());
2595 }
2596 }
2597}
2598
2599template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2601 replaceMap(const Teuchos::RCP<const map_type>& newMap) {
2602 using Teuchos::ArrayRCP;
2603 using Teuchos::Comm;
2604 using Teuchos::RCP;
2605 using ST = Scalar;
2606 using LO = LocalOrdinal;
2607 using GO = GlobalOrdinal;
2608
2609 // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2610 // it might work if the MV is a column view of another MV.
2611 // However, things might go wrong when restoring the original
2612 // Map, so we don't allow this case for now.
2614 !this->isConstantStride(), std::logic_error,
2615 "Tpetra::MultiVector::replaceMap: This method does not currently work "
2616 "if the MultiVector is a column view of another MultiVector (that is, if "
2617 "isConstantStride() == false).");
2618
2619 // Case 1: current Map and new Map are both nonnull on this process.
2620 // Case 2: current Map is nonnull, new Map is null.
2621 // Case 3: current Map is null, new Map is nonnull.
2622 // Case 4: both Maps are null: forbidden.
2623 //
2624 // Case 1 means that we don't have to do anything on this process,
2625 // other than assign the new Map. (We always have to do that.)
2626 // It's an error for the user to supply a Map that requires
2627 // resizing in this case.
2628 //
2629 // Case 2 means that the calling process is in the current Map's
2630 // communicator, but will be excluded from the new Map's
2631 // communicator. We don't have to do anything on the calling
2632 // process; just leave whatever data it may have alone.
2633 //
2634 // Case 3 means that the calling process is excluded from the
2635 // current Map's communicator, but will be included in the new
2636 // Map's communicator. This means we need to (re)allocate the
2637 // local DualView if it does not have the right number of rows.
2638 // If the new number of rows is nonzero, we'll fill the newly
2639 // allocated local data with zeros, as befits a projection
2640 // operation.
2641 //
2642 // The typical use case for Case 3 is that the MultiVector was
2643 // first created with the Map with more processes, then that Map
2644 // was replaced with a Map with fewer processes, and finally the
2645 // original Map was restored on this call to replaceMap.
2646
2647 if (this->getMap().is_null()) { // current Map is null
2648 // If this->getMap() is null, that means that this MultiVector
2649 // has already had replaceMap happen to it. In that case, just
2650 // reallocate the DualView with the right size.
2651
2653 newMap.is_null(), std::invalid_argument,
2654 "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2655 "This probably means that the input Map is incorrect.");
2656
2657 // Case 3: current Map is null, new Map is nonnull.
2658 // Reallocate the DualView with the right dimensions.
2659 const size_t newNumRows = newMap->getLocalNumElements();
2660 const size_t origNumRows = view_.extent(0);
2661 const size_t numCols = this->getNumVectors();
2662
2663 if (origNumRows != newNumRows || view_.extent(1) != numCols) {
2665 }
2666 } else if (newMap.is_null()) { // Case 2: current Map is nonnull, new Map is null
2667 // I am an excluded process. Reinitialize my data so that I
2668 // have 0 rows. Keep the number of columns as before.
2669 const size_t newNumRows = static_cast<size_t>(0);
2670 const size_t numCols = this->getNumVectors();
2672 }
2673
2674 this->map_ = newMap;
2675}
2676
2677template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2679 scale(const Scalar& alpha) {
2680 using Kokkos::ALL;
2681 using IST = impl_scalar_type;
2682
2683 const IST theAlpha = static_cast<IST>(alpha);
2684#if KOKKOS_VERSION >= 40799
2685 if (theAlpha == KokkosKernels::ArithTraits<IST>::one()) {
2686#else
2687 if (theAlpha == Kokkos::ArithTraits<IST>::one()) {
2688#endif
2689 return; // do nothing
2690 }
2691 const size_t lclNumRows = getLocalLength();
2692 const size_t numVecs = getNumVectors();
2693 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2694 const std::pair<size_t, size_t> colRng(0, numVecs);
2695
2696 // We can't substitute putScalar(0.0) for scale(0.0), because the
2697 // former will overwrite NaNs present in the MultiVector. The
2698 // semantics of this call require multiplying them by 0, which
2699 // IEEE 754 requires to be NaN.
2700
2701 // If we need sync to device, then host has the most recent version.
2702 const bool useHostVersion = need_sync_device();
2703 if (useHostVersion) {
2704 auto Y_lcl = Kokkos::subview(getLocalViewHost(Access::ReadWrite), rowRng, ALL());
2705 if (isConstantStride()) {
2706 KokkosBlas::scal(Y_lcl, theAlpha, Y_lcl);
2707 } else {
2708 for (size_t k = 0; k < numVecs; ++k) {
2709 const size_t Y_col = whichVectors_[k];
2710 auto Y_k = Kokkos::subview(Y_lcl, ALL(), Y_col);
2711 KokkosBlas::scal(Y_k, theAlpha, Y_k);
2712 }
2713 }
2714 } else { // work on device
2715 auto Y_lcl = Kokkos::subview(getLocalViewDevice(Access::ReadWrite), rowRng, ALL());
2716 if (isConstantStride()) {
2717 KokkosBlas::scal(Y_lcl, theAlpha, Y_lcl);
2718 } else {
2719 for (size_t k = 0; k < numVecs; ++k) {
2720 const size_t Y_col = isConstantStride() ? k : whichVectors_[k];
2721 auto Y_k = Kokkos::subview(Y_lcl, ALL(), Y_col);
2722 KokkosBlas::scal(Y_k, theAlpha, Y_k);
2723 }
2724 }
2725 }
2726}
2727
2728template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2730 scale(const Teuchos::ArrayView<const Scalar>& alphas) {
2731 const size_t numVecs = this->getNumVectors();
2732 const size_t numAlphas = static_cast<size_t>(alphas.size());
2734 numAlphas != numVecs, std::invalid_argument,
2735 "Tpetra::MultiVector::"
2736 "scale: alphas.size() = "
2737 << numAlphas << " != this->getNumVectors() = "
2738 << numVecs << ".");
2739
2740 // Use a DualView to copy the scaling constants onto the device.
2741 using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2742 k_alphas_type k_alphas("alphas::tmp", numAlphas);
2743 k_alphas.modify_host();
2744 for (size_t i = 0; i < numAlphas; ++i) {
2745 k_alphas.view_host()(i) = static_cast<impl_scalar_type>(alphas[i]);
2746 }
2747 k_alphas.sync_device();
2748 // Invoke the scale() overload that takes a device View of coefficients.
2749 this->scale(k_alphas.view_device());
2750}
2751
2752template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2754 scale(const Kokkos::View<const impl_scalar_type*, device_type>& alphas) {
2755 using Kokkos::ALL;
2756 using Kokkos::subview;
2757
2758 const size_t lclNumRows = this->getLocalLength();
2759 const size_t numVecs = this->getNumVectors();
2761 static_cast<size_t>(alphas.extent(0)) != numVecs,
2762 std::invalid_argument,
2763 "Tpetra::MultiVector::scale(alphas): "
2764 "alphas.extent(0) = "
2765 << alphas.extent(0)
2766 << " != this->getNumVectors () = " << numVecs << ".");
2767 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2768 const std::pair<size_t, size_t> colRng(0, numVecs);
2769
2770 // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2771 // type of the return value of subview. This is because if we
2772 // switch the array layout from LayoutLeft to LayoutRight
2773 // (preferred for performance of block operations), the types
2774 // below won't be valid. (A view of a column of a LayoutRight
2775 // multivector has LayoutStride, not LayoutLeft.)
2776
2777 // If we need sync to device, then host has the most recent version.
2778 const bool useHostVersion = this->need_sync_device();
2779 if (useHostVersion) {
2780 // Work in host memory. This means we need to create a host
2781 // mirror of the input View of coefficients.
2782 auto alphas_h = Kokkos::create_mirror_view(alphas);
2783 // DEEP_COPY REVIEW - NOT TESTED
2784 Kokkos::deep_copy(alphas_h, alphas);
2785
2786 auto Y_lcl = subview(this->getLocalViewHost(Access::ReadWrite), rowRng, ALL());
2787 if (isConstantStride()) {
2788 KokkosBlas::scal(Y_lcl, alphas_h, Y_lcl);
2789 } else {
2790 for (size_t k = 0; k < numVecs; ++k) {
2791 const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2792 auto Y_k = subview(Y_lcl, ALL(), Y_col);
2793 // We don't have to use the entire 1-D View here; we can use
2794 // the version that takes a scalar coefficient.
2795 KokkosBlas::scal(Y_k, alphas_h(k), Y_k);
2796 }
2797 }
2798 } else { // Work in device memory, using the input View 'alphas' directly.
2799 auto Y_lcl = subview(this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL());
2800 if (isConstantStride()) {
2801 KokkosBlas::scal(Y_lcl, alphas, Y_lcl);
2802 } else {
2803 // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2804 // as values on host, so copy them to host. Another approach
2805 // would be to fix scal() so that it takes a 0-D View as the
2806 // second argument.
2807 auto alphas_h = Kokkos::create_mirror_view(alphas);
2808 // DEEP_COPY REVIEW - NOT TESTED
2809 Kokkos::deep_copy(alphas_h, alphas);
2810
2811 for (size_t k = 0; k < numVecs; ++k) {
2812 const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2813 auto Y_k = subview(Y_lcl, ALL(), Y_col);
2814 KokkosBlas::scal(Y_k, alphas_h(k), Y_k);
2815 }
2816 }
2817 }
2818}
2819
2820template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2822 scale(const Scalar& alpha,
2824 using Kokkos::ALL;
2825 using Kokkos::subview;
2826 const char tfecfFuncName[] = "scale: ";
2827
2828 const size_t lclNumRows = getLocalLength();
2829 const size_t numVecs = getNumVectors();
2830
2832 lclNumRows != A.getLocalLength(), std::invalid_argument,
2833 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2834 << A.getLocalLength() << ".");
2836 numVecs != A.getNumVectors(), std::invalid_argument,
2837 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2838 << A.getNumVectors() << ".");
2839
2840 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
2841 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2842 const std::pair<size_t, size_t> colRng(0, numVecs);
2843
2844 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2845 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2846 auto Y_lcl = subview(Y_lcl_orig, rowRng, ALL());
2847 auto X_lcl = subview(X_lcl_orig, rowRng, ALL());
2848
2849 if (isConstantStride() && A.isConstantStride()) {
2850 KokkosBlas::scal(Y_lcl, theAlpha, X_lcl);
2851 } else {
2852 // Make sure that Kokkos only uses the local length for add.
2853 for (size_t k = 0; k < numVecs; ++k) {
2854 const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2855 const size_t X_col = A.isConstantStride() ? k : A.whichVectors_[k];
2856 auto Y_k = subview(Y_lcl, ALL(), Y_col);
2857 auto X_k = subview(X_lcl, ALL(), X_col);
2858
2859 KokkosBlas::scal(Y_k, theAlpha, X_k);
2860 }
2861 }
2862}
2863
2864template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2867 const char tfecfFuncName[] = "reciprocal: ";
2868
2870 getLocalLength() != A.getLocalLength(), std::runtime_error,
2871 "MultiVectors do not have the same local length. "
2872 "this->getLocalLength() = "
2873 << getLocalLength()
2874 << " != A.getLocalLength() = " << A.getLocalLength() << ".");
2876 A.getNumVectors() != this->getNumVectors(), std::runtime_error,
2877 ": MultiVectors do not have the same number of columns (vectors). "
2878 "this->getNumVectors() = "
2879 << getNumVectors()
2880 << " != A.getNumVectors() = " << A.getNumVectors() << ".");
2881
2882 const size_t numVecs = getNumVectors();
2883
2884 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2885 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2886
2887 if (isConstantStride() && A.isConstantStride()) {
2888 KokkosBlas::reciprocal(this_view_dev, A_view_dev);
2889 } else {
2890 using Kokkos::ALL;
2891 using Kokkos::subview;
2892 for (size_t k = 0; k < numVecs; ++k) {
2893 const size_t this_col = isConstantStride() ? k : whichVectors_[k];
2895 const size_t A_col = isConstantStride() ? k : A.whichVectors_[k];
2896 auto vector_Ak = subview(A_view_dev, ALL(), A_col);
2897 KokkosBlas::reciprocal(vector_k, vector_Ak);
2898 }
2899 }
2900}
2901
2902template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2905 const char tfecfFuncName[] = "abs";
2906
2908 getLocalLength() != A.getLocalLength(), std::runtime_error,
2909 ": MultiVectors do not have the same local length. "
2910 "this->getLocalLength() = "
2911 << getLocalLength()
2912 << " != A.getLocalLength() = " << A.getLocalLength() << ".");
2914 A.getNumVectors() != this->getNumVectors(), std::runtime_error,
2915 ": MultiVectors do not have the same number of columns (vectors). "
2916 "this->getNumVectors() = "
2917 << getNumVectors()
2918 << " != A.getNumVectors() = " << A.getNumVectors() << ".");
2919 const size_t numVecs = getNumVectors();
2920
2921 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2922 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2923
2924 if (isConstantStride() && A.isConstantStride()) {
2925 KokkosBlas::abs(this_view_dev, A_view_dev);
2926 } else {
2927 using Kokkos::ALL;
2928 using Kokkos::subview;
2929
2930 for (size_t k = 0; k < numVecs; ++k) {
2931 const size_t this_col = isConstantStride() ? k : whichVectors_[k];
2933 const size_t A_col = isConstantStride() ? k : A.whichVectors_[k];
2934 auto vector_Ak = subview(A_view_dev, ALL(), A_col);
2935 KokkosBlas::abs(vector_k, vector_Ak);
2936 }
2937 }
2938}
2939
2940template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2942 update(const Scalar& alpha,
2944 const Scalar& beta) {
2945 const char tfecfFuncName[] = "update: ";
2946 using Kokkos::ALL;
2947 using Kokkos::subview;
2948
2949 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::update(alpha,A,beta)");
2950
2951 const size_t lclNumRows = getLocalLength();
2952 const size_t numVecs = getNumVectors();
2953
2956 lclNumRows != A.getLocalLength(), std::invalid_argument,
2957 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2958 << A.getLocalLength() << ".");
2960 numVecs != A.getNumVectors(), std::invalid_argument,
2961 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2962 << A.getNumVectors() << ".");
2963 }
2964
2965 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
2966 const impl_scalar_type theBeta = static_cast<impl_scalar_type>(beta);
2967 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
2968 const std::pair<size_t, size_t> colRng(0, numVecs);
2969
2970 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2971 auto Y_lcl = subview(Y_lcl_orig, rowRng, Kokkos::ALL());
2972 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2973 auto X_lcl = subview(X_lcl_orig, rowRng, Kokkos::ALL());
2974
2975 // The device memory of *this is about to be modified
2976 if (isConstantStride() && A.isConstantStride()) {
2977 KokkosBlas::axpby(theAlpha, X_lcl, theBeta, Y_lcl);
2978 } else {
2979 // Make sure that Kokkos only uses the local length for add.
2980 for (size_t k = 0; k < numVecs; ++k) {
2981 const size_t Y_col = this->isConstantStride() ? k : this->whichVectors_[k];
2982 const size_t X_col = A.isConstantStride() ? k : A.whichVectors_[k];
2983 auto Y_k = subview(Y_lcl, ALL(), Y_col);
2984 auto X_k = subview(X_lcl, ALL(), X_col);
2985
2986 KokkosBlas::axpby(theAlpha, X_k, theBeta, Y_k);
2987 }
2988 }
2989}
2990
2991template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2993 update(const Scalar& alpha,
2995 const Scalar& beta,
2997 const Scalar& gamma) {
2998 using Kokkos::ALL;
2999 using Kokkos::subview;
3000
3001 const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
3002
3003 ::Tpetra::Details::ProfilingRegion region("Tpetra::MV::update(alpha,A,beta,B,gamma)");
3004
3005 const size_t lclNumRows = this->getLocalLength();
3006 const size_t numVecs = getNumVectors();
3007
3010 lclNumRows != A.getLocalLength(), std::invalid_argument,
3011 "The input MultiVector A has " << A.getLocalLength() << " local "
3012 "row(s), but this MultiVector has "
3013 << lclNumRows << " local "
3014 "row(s).");
3016 lclNumRows != B.getLocalLength(), std::invalid_argument,
3017 "The input MultiVector B has " << B.getLocalLength() << " local "
3018 "row(s), but this MultiVector has "
3019 << lclNumRows << " local "
3020 "row(s).");
3022 A.getNumVectors() != numVecs, std::invalid_argument,
3023 "The input MultiVector A has " << A.getNumVectors() << " column(s), "
3024 "but this MultiVector has "
3025 << numVecs << " column(s).");
3027 B.getNumVectors() != numVecs, std::invalid_argument,
3028 "The input MultiVector B has " << B.getNumVectors() << " column(s), "
3029 "but this MultiVector has "
3030 << numVecs << " column(s).");
3031 }
3032
3033 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
3034 const impl_scalar_type theBeta = static_cast<impl_scalar_type>(beta);
3035 const impl_scalar_type theGamma = static_cast<impl_scalar_type>(gamma);
3036
3037 const std::pair<size_t, size_t> rowRng(0, lclNumRows);
3038 const std::pair<size_t, size_t> colRng(0, numVecs);
3039
3040 // Prefer 'auto' over specifying the type explicitly. This avoids
3041 // issues with a subview possibly having a different type than the
3042 // original view.
3043 auto C_lcl = subview(this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL());
3044 auto A_lcl = subview(A.getLocalViewDevice(Access::ReadOnly), rowRng, ALL());
3045 auto B_lcl = subview(B.getLocalViewDevice(Access::ReadOnly), rowRng, ALL());
3046
3047 if (isConstantStride() && A.isConstantStride() && B.isConstantStride()) {
3048 KokkosBlas::update(theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3049 } else {
3050 // Some input (or *this) is not constant stride,
3051 // so perform the update one column at a time.
3052 for (size_t k = 0; k < numVecs; ++k) {
3053 const size_t this_col = isConstantStride() ? k : whichVectors_[k];
3054 const size_t A_col = A.isConstantStride() ? k : A.whichVectors_[k];
3055 const size_t B_col = B.isConstantStride() ? k : B.whichVectors_[k];
3056 KokkosBlas::update(theAlpha, subview(A_lcl, rowRng, A_col),
3057 theBeta, subview(B_lcl, rowRng, B_col),
3059 }
3060 }
3061}
3062
3063template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3064Teuchos::ArrayRCP<const Scalar>
3066 getData(size_t j) const {
3067 using Kokkos::ALL;
3068 using IST = impl_scalar_type;
3069 const char tfecfFuncName[] = "getData: ";
3070
3071 // Any MultiVector method that called the (classic) Kokkos Node's
3072 // viewBuffer or viewBufferNonConst methods always implied a
3073 // device->host synchronization. Thus, we synchronize here as
3074 // well.
3075
3076 auto hostView = getLocalViewHost(Access::ReadOnly);
3077 const size_t col = isConstantStride() ? j : whichVectors_[j];
3078 auto hostView_j = Kokkos::subview(hostView, ALL(), col);
3079 Teuchos::ArrayRCP<const IST> dataAsArcp =
3080 Kokkos::Compat::persistingView(hostView_j, 0, getLocalLength());
3081
3082 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(hostView_j.extent(0)) <
3083 static_cast<size_t>(dataAsArcp.size()),
3084 std::logic_error,
3085 "hostView_j.extent(0) = " << hostView_j.extent(0)
3086 << " < dataAsArcp.size() = " << dataAsArcp.size() << ". "
3087 "Please report this bug to the Tpetra developers.");
3088
3089 return Teuchos::arcp_reinterpret_cast<const Scalar>(dataAsArcp);
3090}
3091
3092template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3093Teuchos::ArrayRCP<Scalar>
3095 getDataNonConst(size_t j) {
3096 using Kokkos::ALL;
3097 using Kokkos::subview;
3098 using IST = impl_scalar_type;
3099 const char tfecfFuncName[] = "getDataNonConst: ";
3100
3101 auto hostView = getLocalViewHost(Access::ReadWrite);
3102 const size_t col = isConstantStride() ? j : whichVectors_[j];
3103 auto hostView_j = subview(hostView, ALL(), col);
3104 Teuchos::ArrayRCP<IST> dataAsArcp =
3105 Kokkos::Compat::persistingView(hostView_j, 0, getLocalLength());
3106
3107 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(hostView_j.extent(0)) <
3108 static_cast<size_t>(dataAsArcp.size()),
3109 std::logic_error,
3110 "hostView_j.extent(0) = " << hostView_j.extent(0)
3111 << " < dataAsArcp.size() = " << dataAsArcp.size() << ". "
3112 "Please report this bug to the Tpetra developers.");
3113
3114 return Teuchos::arcp_reinterpret_cast<Scalar>(dataAsArcp);
3115}
3116
3117template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3118Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3120 subCopy(const Teuchos::ArrayView<const size_t>& cols) const {
3121 using Teuchos::RCP;
3123
3124 // Check whether the index set in cols is contiguous. If it is,
3125 // use the more efficient Range1D version of subCopy.
3126 bool contiguous = true;
3127 const size_t numCopyVecs = static_cast<size_t>(cols.size());
3128 for (size_t j = 1; j < numCopyVecs; ++j) {
3129 if (cols[j] != cols[j - 1] + static_cast<size_t>(1)) {
3130 contiguous = false;
3131 break;
3132 }
3133 }
3134 if (contiguous && numCopyVecs > 0) {
3135 return this->subCopy(Teuchos::Range1D(cols[0], cols[numCopyVecs - 1]));
3136 } else {
3137 RCP<const MV> X_sub = this->subView(cols);
3138 RCP<MV> Y(new MV(this->getMap(), numCopyVecs, false));
3139 Y->assign(*X_sub);
3140 return Y;
3141 }
3142}
3143
3144template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3145Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3147 subCopy(const Teuchos::Range1D& colRng) const {
3148 using Teuchos::RCP;
3150
3151 RCP<const MV> X_sub = this->subView(colRng);
3152 RCP<MV> Y(new MV(this->getMap(), static_cast<size_t>(colRng.size()), false));
3153 Y->assign(*X_sub);
3154 return Y;
3155}
3156
3157template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3158size_t
3160 getOrigNumLocalRows() const {
3161 return view_.origExtent(0);
3162}
3163
3164template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3165size_t
3167 getOrigNumLocalCols() const {
3168 return view_.origExtent(1);
3169}
3170
3171template <class Scalar, class LO, class GO, class Node>
3174 const Teuchos::RCP<const map_type>& subMap,
3175 const local_ordinal_type rowOffset)
3176 : base_type(subMap) {
3177 using Kokkos::ALL;
3178 using Kokkos::subview;
3179 using std::endl;
3180 using Teuchos::outArg;
3181 using Teuchos::RCP;
3182 using Teuchos::rcp;
3183 using Teuchos::REDUCE_MIN;
3184 using Teuchos::reduceAll;
3186 const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3187 const char suffix[] = "Please report this bug to the Tpetra developers.";
3188 int lclGood = 1;
3189 int gblGood = 1;
3190 std::unique_ptr<std::ostringstream> errStrm;
3191 const bool debug = ::Tpetra::Details::Behavior::debug();
3192 const bool verbose = ::Tpetra::Details::Behavior::verbose();
3193
3194 // Be careful to use the input Map's communicator, not X's. The
3195 // idea is that, on return, *this is a subview of X, using the
3196 // input Map.
3197 const auto comm = subMap->getComm();
3198 TEUCHOS_ASSERT(!comm.is_null());
3199 const int myRank = comm->getRank();
3200
3201 const LO lclNumRowsBefore = static_cast<LO>(X.getLocalLength());
3202 const LO numCols = static_cast<LO>(X.getNumVectors());
3203 const LO newNumRows = static_cast<LO>(subMap->getLocalNumElements());
3204 if (verbose) {
3205 std::ostringstream os;
3206 os << "Proc " << myRank << ": " << prefix
3207 << "X: {lclNumRows: " << lclNumRowsBefore
3208 << ", origLclNumRows: " << X.getOrigNumLocalRows()
3209 << ", numCols: " << numCols << "}, "
3210 << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3211 std::cerr << os.str();
3212 }
3213 // We ask for the _original_ number of rows in X, because X could
3214 // be a shorter (fewer rows) view of a longer MV. (For example, X
3215 // could be a domain Map view of a column Map MV.)
3216 const bool tooManyElts =
3217 newNumRows + rowOffset > static_cast<LO>(X.getOrigNumLocalRows());
3218 if (tooManyElts) {
3219 errStrm = std::unique_ptr<std::ostringstream>(new std::ostringstream);
3220 *errStrm << " Proc " << myRank << ": subMap->getLocalNumElements() (="
3221 << newNumRows << ") + rowOffset (=" << rowOffset
3222 << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows()
3223 << ")." << endl;
3224 lclGood = 0;
3225 TEUCHOS_TEST_FOR_EXCEPTION(!debug && tooManyElts, std::invalid_argument,
3226 prefix << errStrm->str() << suffix);
3227 }
3228
3229 if (debug) {
3231 if (gblGood != 1) {
3232 std::ostringstream gblErrStrm;
3233 const std::string myErrStr =
3234 errStrm.get() != nullptr ? errStrm->str() : std::string("");
3236 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, gblErrStrm.str());
3237 }
3238 }
3239
3240 using range_type = std::pair<LO, LO>;
3241 const range_type origRowRng(rowOffset, static_cast<LO>(X.view_.origExtent(0)));
3242 const range_type rowRng(rowOffset, rowOffset + newNumRows);
3243
3244 wrapped_dual_view_type newView = takeSubview(X.view_, rowRng, ALL());
3245
3246 // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3247 // handling subviews of degenerate Views quite so well. For some
3248 // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3249 // 0. We work around by creating a new empty DualView of the
3250 // desired (degenerate) dimensions.
3251 if (newView.extent(0) == 0 &&
3252 newView.extent(1) != X.view_.extent(1)) {
3253 newView =
3254 allocDualView<Scalar, LO, GO, Node>(0, X.getNumVectors());
3255 }
3256
3257 MV subViewMV = X.isConstantStride() ? MV(subMap, newView) : MV(subMap, newView, X.whichVectors_());
3258
3259 if (debug) {
3260 const LO lclNumRowsRet = static_cast<LO>(subViewMV.getLocalLength());
3261 const LO numColsRet = static_cast<LO>(subViewMV.getNumVectors());
3262 if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3263 lclGood = 0;
3264 if (errStrm.get() == nullptr) {
3265 errStrm = std::unique_ptr<std::ostringstream>(new std::ostringstream);
3266 }
3267 *errStrm << " Proc " << myRank << ": subMap.getLocalNumElements(): " << newNumRows << ", subViewMV.getLocalLength(): " << lclNumRowsRet << ", X.getNumVectors(): " << numCols << ", subViewMV.getNumVectors(): " << numColsRet << endl;
3268 }
3270 if (gblGood != 1) {
3271 std::ostringstream gblErrStrm;
3272 if (myRank == 0) {
3273 gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3274 "dimensions on one or more processes:"
3275 << endl;
3276 }
3277 const std::string myErrStr =
3278 errStrm.get() != nullptr ? errStrm->str() : std::string("");
3280 gblErrStrm << suffix << endl;
3281 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, gblErrStrm.str());
3282 }
3283 }
3284
3285 if (verbose) {
3286 std::ostringstream os;
3287 os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3288 std::cerr << os.str();
3289 }
3290
3291 *this = subViewMV; // shallow copy
3292
3293 if (verbose) {
3294 std::ostringstream os;
3295 os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3296 std::cerr << os.str();
3297 }
3298}
3299
3300template <class Scalar, class LO, class GO, class Node>
3303 const map_type& subMap,
3304 const size_t rowOffset)
3305 : MultiVector(X, Teuchos::RCP<const map_type>(new map_type(subMap)),
3307
3308template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3309Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3311 offsetView(const Teuchos::RCP<const map_type>& subMap,
3312 const size_t offset) const {
3314 return Teuchos::rcp(new MV(*this, *subMap, offset));
3315}
3316
3317template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3318Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3320 offsetViewNonConst(const Teuchos::RCP<const map_type>& subMap,
3321 const size_t offset) {
3323 return Teuchos::rcp(new MV(*this, *subMap, offset));
3324}
3325
3326template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3327Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3329 subView(const Teuchos::ArrayView<const size_t>& cols) const {
3330 using Teuchos::Array;
3331 using Teuchos::rcp;
3333
3334 const size_t numViewCols = static_cast<size_t>(cols.size());
3336 numViewCols < 1, std::runtime_error,
3337 "Tpetra::MultiVector::subView"
3338 "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3339 "contain at least one entry, but cols.size() = "
3340 << cols.size()
3341 << " == 0.");
3342
3343 // Check whether the index set in cols is contiguous. If it is,
3344 // use the more efficient Range1D version of subView.
3345 bool contiguous = true;
3346 for (size_t j = 1; j < numViewCols; ++j) {
3347 if (cols[j] != cols[j - 1] + static_cast<size_t>(1)) {
3348 contiguous = false;
3349 break;
3350 }
3351 }
3352 if (contiguous) {
3353 if (numViewCols == 0) {
3354 // The output MV has no columns, so there is nothing to view.
3355 return rcp(new MV(this->getMap(), numViewCols));
3356 } else {
3357 // Use the more efficient contiguous-index-range version.
3358 return this->subView(Teuchos::Range1D(cols[0], cols[numViewCols - 1]));
3359 }
3360 }
3361
3362 if (isConstantStride()) {
3363 return rcp(new MV(this->getMap(), view_, cols));
3364 } else {
3365 Array<size_t> newcols(cols.size());
3366 for (size_t j = 0; j < numViewCols; ++j) {
3367 newcols[j] = whichVectors_[cols[j]];
3368 }
3369 return rcp(new MV(this->getMap(), view_, newcols()));
3370 }
3371}
3372
3373template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3374Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3376 subView(const Teuchos::Range1D& colRng) const {
3377 using Kokkos::ALL;
3378 using Kokkos::subview;
3379 using Teuchos::Array;
3380 using Teuchos::RCP;
3381 using Teuchos::rcp;
3382 using ::Tpetra::Details::Behavior;
3384 const char tfecfFuncName[] = "subView(Range1D): ";
3385
3386 const size_t lclNumRows = this->getLocalLength();
3387 const size_t numVecs = this->getNumVectors();
3388 // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3389 // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3390 // "at least one vector.");
3392 static_cast<size_t>(colRng.size()) > numVecs, std::runtime_error,
3393 "colRng.size() = " << colRng.size() << " > this->getNumVectors() = "
3394 << numVecs << ".");
3396 numVecs != 0 && colRng.size() != 0 &&
3397 (colRng.lbound() < static_cast<Teuchos::Ordinal>(0) ||
3398 static_cast<size_t>(colRng.ubound()) >= numVecs),
3399 std::invalid_argument, "Nonempty input range [" << colRng.lbound() << "," << colRng.ubound() << "] exceeds the valid range of column indices "
3400 "[0, "
3401 << numVecs << "].");
3402
3403 RCP<const MV> X_ret; // the MultiVector subview to return
3404
3405 // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3406 // broken for the case of views with zero rows. I will brutally
3407 // enforce that the subview has the correct dimensions. In
3408 // particular, in the case of zero rows, I will, if necessary,
3409 // create a new dual_view_type with zero rows and the correct
3410 // number of columns. In a debug build, I will use an all-reduce
3411 // to ensure that it has the correct dimensions on all processes.
3412
3413 const std::pair<size_t, size_t> rows(0, lclNumRows);
3414 if (colRng.size() == 0) {
3415 const std::pair<size_t, size_t> cols(0, 0); // empty range
3416 wrapped_dual_view_type X_sub = takeSubview(this->view_, ALL(), cols);
3417 X_ret = rcp(new MV(this->getMap(), X_sub));
3418 } else {
3419 // Returned MultiVector is constant stride only if *this is.
3420 if (isConstantStride()) {
3421 const std::pair<size_t, size_t> cols(colRng.lbound(),
3422 colRng.ubound() + 1);
3423 wrapped_dual_view_type X_sub = takeSubview(this->view_, ALL(), cols);
3424 X_ret = rcp(new MV(this->getMap(), X_sub));
3425 } else {
3426 if (static_cast<size_t>(colRng.size()) == static_cast<size_t>(1)) {
3427 // We're only asking for one column, so the result does have
3428 // constant stride, even though this MultiVector does not.
3429 const std::pair<size_t, size_t> col(whichVectors_[0] + colRng.lbound(),
3430 whichVectors_[0] + colRng.ubound() + 1);
3432 X_ret = rcp(new MV(this->getMap(), X_sub));
3433 } else {
3434 Array<size_t> which(whichVectors_.begin() + colRng.lbound(),
3435 whichVectors_.begin() + colRng.ubound() + 1);
3436 X_ret = rcp(new MV(this->getMap(), view_, which));
3437 }
3438 }
3439 }
3440
3441 const bool debug = Behavior::debug();
3442 if (debug) {
3443 using Teuchos::Comm;
3444 using Teuchos::outArg;
3445 using Teuchos::REDUCE_MIN;
3446 using Teuchos::reduceAll;
3447
3448 RCP<const Comm<int> > comm = this->getMap().is_null() ? Teuchos::null : this->getMap()->getComm();
3449 if (!comm.is_null()) {
3450 int lclSuccess = 1;
3451 int gblSuccess = 1;
3452
3453 if (X_ret.is_null()) {
3454 lclSuccess = 0;
3455 }
3458 "X_ret (the subview of this "
3459 "MultiVector; the return value of this method) is null on some MPI "
3460 "process in this MultiVector's communicator. This should never "
3461 "happen. Please report this bug to the Tpetra developers.");
3462 if (!X_ret.is_null() &&
3463 X_ret->getNumVectors() != static_cast<size_t>(colRng.size())) {
3464 lclSuccess = 0;
3465 }
3469 "X_ret->getNumVectors() != "
3470 "colRng.size(), on at least one MPI process in this MultiVector's "
3471 "communicator. This should never happen. "
3472 "Please report this bug to the Tpetra developers.");
3473 }
3474 }
3475 return X_ret;
3476}
3477
3478template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3479Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3481 subViewNonConst(const Teuchos::ArrayView<const size_t>& cols) {
3483 return Teuchos::rcp_const_cast<MV>(this->subView(cols));
3484}
3485
3486template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3487Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3489 subViewNonConst(const Teuchos::Range1D& colRng) {
3491 return Teuchos::rcp_const_cast<MV>(this->subView(colRng));
3492}
3493
3494template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3497 const size_t j)
3498 : base_type(X.getMap()) {
3499 using Kokkos::subview;
3500 typedef std::pair<size_t, size_t> range_type;
3501 const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3502
3503 const size_t numCols = X.getNumVectors();
3504 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(j >= numCols, std::invalid_argument, "Input index j (== " << j << ") exceeds valid column index range [0, " << numCols << " - 1].");
3505 const size_t jj = X.isConstantStride() ? static_cast<size_t>(j) : static_cast<size_t>(X.whichVectors_[j]);
3506 this->view_ = takeSubview(X.view_, Kokkos::ALL(), range_type(jj, jj + 1));
3507
3508 // mfh 31 Jul 2017: It would be unwise to execute concurrent
3509 // Export or Import operations with different subviews of a
3510 // MultiVector. Thus, it is safe to reuse communication buffers.
3511 // See #1560 discussion.
3512 //
3513 // We only need one column's worth of buffer for imports_ and
3514 // exports_. Taking subviews now ensures that their lengths will
3515 // be exactly what we need, so we won't have to resize them later.
3516 {
3517 const size_t newSize = X.imports_.extent(0) / numCols;
3518 const size_t offset = jj * newSize;
3519 auto device_view = subview(X.imports_.view_device(),
3520 range_type(offset, offset + newSize));
3521 auto host_view = subview(X.imports_.view_host(),
3522 range_type(offset, offset + newSize));
3523 this->imports_ = decltype(X.imports_)(device_view, host_view);
3524 }
3525 {
3526 const size_t newSize = X.exports_.extent(0) / numCols;
3527 const size_t offset = jj * newSize;
3528 auto device_view = subview(X.exports_.view_device(),
3529 range_type(offset, offset + newSize));
3530 auto host_view = subview(X.exports_.view_host(),
3531 range_type(offset, offset + newSize));
3532 this->exports_ = decltype(X.exports_)(device_view, host_view);
3533 }
3534 // These two DualViews already either have the right number of
3535 // entries, or zero entries. This means that we don't need to
3536 // resize them.
3537 this->numImportPacketsPerLID_ = X.numImportPacketsPerLID_;
3538 this->numExportPacketsPerLID_ = X.numExportPacketsPerLID_;
3539}
3540
3541template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3542Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3544 getVector(const size_t j) const {
3546 return Teuchos::rcp(new V(*this, j));
3547}
3548
3549template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3550Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3552 getVectorNonConst(const size_t j) {
3554 return Teuchos::rcp(new V(*this, j));
3555}
3556
3557template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3559 get1dCopy(const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const {
3560 using IST = impl_scalar_type;
3561 using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3562 Kokkos::HostSpace,
3563 Kokkos::MemoryUnmanaged>;
3564 const char tfecfFuncName[] = "get1dCopy: ";
3565
3566 const size_t numRows = this->getLocalLength();
3567 const size_t numCols = this->getNumVectors();
3568
3570 "LDA = " << LDA << " < numRows = " << numRows << ".");
3571 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numRows > size_t(0) && numCols > size_t(0) &&
3572 size_t(A.size()) < LDA * (numCols - 1) + numRows,
3573 std::runtime_error,
3574 "A.size() = " << A.size() << ", but its size must be at least "
3575 << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3576
3577 const std::pair<size_t, size_t> rowRange(0, numRows);
3578 const std::pair<size_t, size_t> colRange(0, numCols);
3579
3580 input_view_type A_view_orig(reinterpret_cast<IST*>(A.getRawPtr()),
3581 LDA, numCols);
3582 auto A_view = Kokkos::subview(A_view_orig, rowRange, colRange);
3583
3595 if (this->need_sync_host() && this->need_sync_device()) {
3598 throw std::runtime_error("Tpetra::MultiVector: A non-const view is alive outside and we cannot give a copy where host or device view can be modified outside");
3599 } else {
3600 const bool useHostView = view_.host_view_use_count() >= view_.device_view_use_count();
3601 if (this->isConstantStride()) {
3602 if (useHostView) {
3603 auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3604 // DEEP_COPY REVIEW - NOT TESTED
3605 Kokkos::deep_copy(A_view, srcView_host);
3606 } else {
3607 auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3608 // DEEP_COPY REVIEW - NOT TESTED
3609 Kokkos::deep_copy(A_view, srcView_device);
3610 }
3611 } else {
3612 for (size_t j = 0; j < numCols; ++j) {
3613 const size_t srcCol = this->whichVectors_[j];
3614 auto dstColView = Kokkos::subview(A_view, rowRange, j);
3615
3616 if (useHostView) {
3617 auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3618 auto srcColView_host = Kokkos::subview(srcView_host, rowRange, srcCol);
3619 // DEEP_COPY REVIEW - NOT TESTED
3620 Kokkos::deep_copy(dstColView, srcColView_host);
3621 } else {
3622 auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3623 auto srcColView_device = Kokkos::subview(srcView_device, rowRange, srcCol);
3624 // DEEP_COPY REVIEW - NOT TESTED
3625 Kokkos::deep_copy(dstColView, srcColView_device);
3626 }
3627 }
3628 }
3629 }
3630}
3631
3632template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3634 get2dCopy(const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const {
3636 const char tfecfFuncName[] = "get2dCopy: ";
3637 const size_t numRows = this->getLocalLength();
3638 const size_t numCols = this->getNumVectors();
3639
3641 static_cast<size_t>(ArrayOfPtrs.size()) != numCols,
3642 std::runtime_error,
3643 "Input array of pointers must contain as many "
3644 "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3645 << ArrayOfPtrs.size() << " != getNumVectors() = " << numCols << ".");
3646
3647 if (numRows != 0 && numCols != 0) {
3648 // No side effects until we've validated the input.
3649 for (size_t j = 0; j < numCols; ++j) {
3650 const size_t dstLen = static_cast<size_t>(ArrayOfPtrs[j].size());
3652 dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3653 "the input array of arrays is not long enough to fit all entries in "
3654 "that column of the MultiVector. ArrayOfPtrs[j].size() = "
3655 << dstLen << " < getLocalLength() = " << numRows << ".");
3656 }
3657
3658 // We've validated the input, so it's safe to start copying.
3659 for (size_t j = 0; j < numCols; ++j) {
3660 Teuchos::RCP<const V> X_j = this->getVector(j);
3661 const size_t LDA = static_cast<size_t>(ArrayOfPtrs[j].size());
3662 X_j->get1dCopy(ArrayOfPtrs[j], LDA);
3663 }
3664 }
3665}
3666
3667template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3668Teuchos::ArrayRCP<const Scalar>
3670 get1dView() const {
3671 if (getLocalLength() == 0 || getNumVectors() == 0) {
3672 return Teuchos::null;
3673 } else {
3675 !isConstantStride(), std::runtime_error,
3676 "Tpetra::MultiVector::"
3677 "get1dView: This MultiVector does not have constant stride, so it is "
3678 "not possible to view its data as a single array. You may check "
3679 "whether a MultiVector has constant stride by calling "
3680 "isConstantStride().");
3681 // Since get1dView() is and was always marked const, I have to
3682 // cast away const here in order not to break backwards
3683 // compatibility.
3684 auto X_lcl = getLocalViewHost(Access::ReadOnly);
3685 Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3686 Kokkos::Compat::persistingView(X_lcl);
3687 return Teuchos::arcp_reinterpret_cast<const Scalar>(dataAsArcp);
3688 }
3689}
3690
3691template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3692Teuchos::ArrayRCP<Scalar>
3695 if (this->getLocalLength() == 0 || this->getNumVectors() == 0) {
3696 return Teuchos::null;
3697 } else {
3698 TEUCHOS_TEST_FOR_EXCEPTION(!isConstantStride(), std::runtime_error,
3699 "Tpetra::MultiVector::"
3700 "get1dViewNonConst: This MultiVector does not have constant stride, "
3701 "so it is not possible to view its data as a single array. You may "
3702 "check whether a MultiVector has constant stride by calling "
3703 "isConstantStride().");
3704 auto X_lcl = getLocalViewHost(Access::ReadWrite);
3705 Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3706 Kokkos::Compat::persistingView(X_lcl);
3707 return Teuchos::arcp_reinterpret_cast<Scalar>(dataAsArcp);
3708 }
3709}
3710
3711template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3712Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3715 auto X_lcl = getLocalViewHost(Access::ReadWrite);
3716
3717 // Don't use the row range here on the outside, in order to avoid
3718 // a strided return type (in case Kokkos::subview is conservative
3719 // about that). Instead, use the row range for the column views
3720 // in the loop.
3721 const size_t myNumRows = this->getLocalLength();
3722 const size_t numCols = this->getNumVectors();
3723 const Kokkos::pair<size_t, size_t> rowRange(0, myNumRows);
3724
3725 Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views(numCols);
3726 for (size_t j = 0; j < numCols; ++j) {
3727 const size_t col = this->isConstantStride() ? j : this->whichVectors_[j];
3728 auto X_lcl_j = Kokkos::subview(X_lcl, rowRange, col);
3729 Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3730 Kokkos::Compat::persistingView(X_lcl_j);
3731 views[j] = Teuchos::arcp_reinterpret_cast<Scalar>(X_lcl_j_arcp);
3732 }
3733 return views;
3734}
3735
3736template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3739 getLocalViewHost(Access::ReadOnlyStruct s) const {
3740 return view_.getHostView(s);
3741}
3742
3743template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3746 getLocalViewHost(Access::ReadWriteStruct s) {
3747 return view_.getHostView(s);
3748}
3749
3750template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3753 getLocalViewHost(Access::OverwriteAllStruct s) {
3754 return view_.getHostView(s);
3755}
3756
3757template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3760 getLocalViewDevice(Access::ReadOnlyStruct s) const {
3761 return view_.getDeviceView(s);
3762}
3763
3764template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3767 getLocalViewDevice(Access::ReadWriteStruct s) {
3768 return view_.getDeviceView(s);
3769}
3770
3771template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3774 getLocalViewDevice(Access::OverwriteAllStruct s) {
3775 return view_.getDeviceView(s);
3776}
3777
3778template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3784
3785template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3786Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3788 get2dView() const {
3789 // Since get2dView() is and was always marked const, I have to
3790 // cast away const here in order not to break backwards
3791 // compatibility.
3792 auto X_lcl = getLocalViewHost(Access::ReadOnly);
3793
3794 // Don't use the row range here on the outside, in order to avoid
3795 // a strided return type (in case Kokkos::subview is conservative
3796 // about that). Instead, use the row range for the column views
3797 // in the loop.
3798 const size_t myNumRows = this->getLocalLength();
3799 const size_t numCols = this->getNumVectors();
3800 const Kokkos::pair<size_t, size_t> rowRange(0, myNumRows);
3801
3802 Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views(numCols);
3803 for (size_t j = 0; j < numCols; ++j) {
3804 const size_t col = this->isConstantStride() ? j : this->whichVectors_[j];
3805 auto X_lcl_j = Kokkos::subview(X_lcl, rowRange, col);
3806 Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3807 Kokkos::Compat::persistingView(X_lcl_j);
3808 views[j] = Teuchos::arcp_reinterpret_cast<const Scalar>(X_lcl_j_arcp);
3809 }
3810 return views;
3811}
3812
3813template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3815 multiply(Teuchos::ETransp transA,
3816 Teuchos::ETransp transB,
3817 const Scalar& alpha,
3820 const Scalar& beta) {
3821 using std::endl;
3822 using Teuchos::CONJ_TRANS;
3823 using Teuchos::NO_TRANS;
3824 using Teuchos::RCP;
3825 using Teuchos::rcp;
3826 using Teuchos::rcpFromRef;
3827 using Teuchos::TRANS;
3828 using ::Tpetra::Details::ProfilingRegion;
3829#if KOKKOS_VERSION >= 40799
3830 using ATS = KokkosKernels::ArithTraits<impl_scalar_type>;
3831#else
3832 using ATS = Kokkos::ArithTraits<impl_scalar_type>;
3833#endif
3834 using LO = local_ordinal_type;
3835 using STS = Teuchos::ScalarTraits<Scalar>;
3837 const char tfecfFuncName[] = "multiply: ";
3838 ProfilingRegion region("Tpetra::MV::multiply");
3839
3840 // This routine performs a variety of matrix-matrix multiply
3841 // operations, interpreting the MultiVector (this-aka C , A and B)
3842 // as 2D matrices. Variations are due to the fact that A, B and C
3843 // can be locally replicated or globally distributed MultiVectors
3844 // and that we may or may not operate with the transpose of A and
3845 // B. Possible cases are:
3846 //
3847 // Operations # Cases Notes
3848 // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
3849 // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
3850 // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
3851 //
3852 // The following operations are not meaningful for 1-D
3853 // distributions:
3854 //
3855 // u1) C(local) = A^T(distr) * B^T(distr) 1
3856 // u2) C(local) = A (distr) * B^X(distr) 2
3857 // u3) C(distr) = A^X(local) * B^X(local) 4
3858 // u4) C(distr) = A^X(local) * B^X(distr) 4
3859 // u5) C(distr) = A^T(distr) * B^X(local) 2
3860 // u6) C(local) = A^X(distr) * B^X(local) 4
3861 // u7) C(distr) = A^X(distr) * B^X(local) 4
3862 // u8) C(local) = A^X(local) * B^X(distr) 4
3863 //
3864 // Total number of cases: 32 (= 2^5).
3865
3866 impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
3867
3868 const bool A_is_local = !A.isDistributed();
3869 const bool B_is_local = !B.isDistributed();
3870 const bool C_is_local = !this->isDistributed();
3871
3872 // In debug mode, check compatibility of local dimensions. We
3873 // only do this in debug mode, since it requires an all-reduce
3874 // to ensure correctness on all processses. It's entirely
3875 // possible that only some processes may have incompatible local
3876 // dimensions. Throwing an exception only on those processes
3877 // could cause this method to hang.
3878 const bool debug = ::Tpetra::Details::Behavior::debug();
3879 if (debug) {
3880 auto myMap = this->getMap();
3881 if (!myMap.is_null() && !myMap->getComm().is_null()) {
3882 using Teuchos::outArg;
3883 using Teuchos::REDUCE_MIN;
3884 using Teuchos::reduceAll;
3885
3886 auto comm = myMap->getComm();
3887 const size_t A_nrows =
3888 (transA != NO_TRANS) ? A.getNumVectors() : A.getLocalLength();
3889 const size_t A_ncols =
3890 (transA != NO_TRANS) ? A.getLocalLength() : A.getNumVectors();
3891 const size_t B_nrows =
3892 (transB != NO_TRANS) ? B.getNumVectors() : B.getLocalLength();
3893 const size_t B_ncols =
3894 (transB != NO_TRANS) ? B.getLocalLength() : B.getNumVectors();
3895
3896 const bool lclBad = this->getLocalLength() != A_nrows ||
3897 this->getNumVectors() != B_ncols || A_ncols != B_nrows;
3898
3899 const int myRank = comm->getRank();
3900 std::ostringstream errStrm;
3901 if (this->getLocalLength() != A_nrows) {
3902 errStrm << "Proc " << myRank << ": this->getLocalLength()="
3903 << this->getLocalLength() << " != A_nrows=" << A_nrows
3904 << "." << std::endl;
3905 }
3906 if (this->getNumVectors() != B_ncols) {
3907 errStrm << "Proc " << myRank << ": this->getNumVectors()="
3908 << this->getNumVectors() << " != B_ncols=" << B_ncols
3909 << "." << std::endl;
3910 }
3911 if (A_ncols != B_nrows) {
3912 errStrm << "Proc " << myRank << ": A_ncols="
3913 << A_ncols << " != B_nrows=" << B_nrows
3914 << "." << std::endl;
3915 }
3916
3917 const int lclGood = lclBad ? 0 : 1;
3918 int gblGood = 0;
3920 if (gblGood != 1) {
3921 std::ostringstream os;
3923
3924 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3925 "Inconsistent local dimensions on at "
3926 "least one process in this object's communicator."
3927 << std::endl
3928 << "Operation: "
3929 << "C(" << (C_is_local ? "local" : "distr") << ") = "
3930 << alpha << "*A"
3931 << (transA == Teuchos::TRANS ? "^T" : (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3932 << "(" << (A_is_local ? "local" : "distr") << ") * "
3933 << beta << "*B"
3934 << (transA == Teuchos::TRANS ? "^T" : (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3935 << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
3936 << "Global dimensions: C(" << this->getGlobalLength() << ", "
3937 << this->getNumVectors() << "), A(" << A.getGlobalLength()
3938 << ", " << A.getNumVectors() << "), B(" << B.getGlobalLength()
3939 << ", " << B.getNumVectors() << ")." << std::endl
3940 << os.str());
3941 }
3942 }
3943 }
3944
3945 // Case 1: C(local) = A^X(local) * B^X(local)
3946 const bool Case1 = C_is_local && A_is_local && B_is_local;
3947 // Case 2: C(local) = A^T(distr) * B (distr)
3948 const bool Case2 = C_is_local && !A_is_local && !B_is_local &&
3949 transA != NO_TRANS &&
3950 transB == NO_TRANS;
3951 // Case 3: C(distr) = A (distr) * B^X(local)
3952 const bool Case3 = !C_is_local && !A_is_local && B_is_local &&
3953 transA == NO_TRANS;
3954
3955 // Test that we are considering a meaningful case
3956 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!Case1 && !Case2 && !Case3, std::runtime_error,
3957 "Multiplication of op(A) and op(B) into *this is not a "
3958 "supported use case.");
3959
3960 if (beta != STS::zero() && Case2) {
3961 // If Case2, then C is local and contributions must be summed
3962 // across all processes. However, if beta != 0, then accumulate
3963 // beta*C into the sum. When summing across all processes, we
3964 // only want to accumulate this once, so set beta == 0 on all
3965 // processes except Process 0.
3966 const int myRank = this->getMap()->getComm()->getRank();
3967 if (myRank != 0) {
3968 beta_local = ATS::zero();
3969 }
3970 }
3971
3972 // We only know how to do matrix-matrix multiplies if all the
3973 // MultiVectors have constant stride. If not, we have to make
3974 // temporary copies of those MultiVectors (including possibly
3975 // *this) that don't have constant stride.
3976 RCP<MV> C_tmp;
3977 if (!isConstantStride()) {
3978 C_tmp = rcp(new MV(*this, Teuchos::Copy)); // deep copy
3979 } else {
3980 C_tmp = rcp(this, false);
3981 }
3982
3984 if (!A.isConstantStride()) {
3985 A_tmp = rcp(new MV(A, Teuchos::Copy)); // deep copy
3986 } else {
3987 A_tmp = rcpFromRef(A);
3988 }
3989
3991 if (!B.isConstantStride()) {
3992 B_tmp = rcp(new MV(B, Teuchos::Copy)); // deep copy
3993 } else {
3994 B_tmp = rcpFromRef(B);
3995 }
3996
3997 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!C_tmp->isConstantStride() || !B_tmp->isConstantStride() ||
3998 !A_tmp->isConstantStride(),
3999 std::logic_error,
4000 "Failed to make temporary constant-stride copies of MultiVectors.");
4001
4002 {
4003 const LO A_lclNumRows = A_tmp->getLocalLength();
4004 const LO A_numVecs = A_tmp->getNumVectors();
4005 auto A_lcl = A_tmp->getLocalViewDevice(Access::ReadOnly);
4006 auto A_sub = Kokkos::subview(A_lcl,
4007 std::make_pair(LO(0), A_lclNumRows),
4008 std::make_pair(LO(0), A_numVecs));
4009
4010 const LO B_lclNumRows = B_tmp->getLocalLength();
4011 const LO B_numVecs = B_tmp->getNumVectors();
4012 auto B_lcl = B_tmp->getLocalViewDevice(Access::ReadOnly);
4013 auto B_sub = Kokkos::subview(B_lcl,
4014 std::make_pair(LO(0), B_lclNumRows),
4015 std::make_pair(LO(0), B_numVecs));
4016
4017 const LO C_lclNumRows = C_tmp->getLocalLength();
4018 const LO C_numVecs = C_tmp->getNumVectors();
4019
4020 auto C_lcl = C_tmp->getLocalViewDevice(Access::ReadWrite);
4021 auto C_sub = Kokkos::subview(C_lcl,
4022 std::make_pair(LO(0), C_lclNumRows),
4023 std::make_pair(LO(0), C_numVecs));
4024 const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' : (transA == Teuchos::TRANS ? 'T' : 'C'));
4025 const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' : (transB == Teuchos::TRANS ? 'T' : 'C'));
4027
4028 ProfilingRegion regionGemm("Tpetra::MV::multiply-call-gemm");
4029
4030 KokkosBlas::gemm(&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4031 beta_local, C_sub);
4032 }
4033
4034 if (!isConstantStride()) {
4035 ::Tpetra::deep_copy(*this, *C_tmp); // Copy the result back into *this.
4036 }
4037
4038 // Dispose of (possibly) extra copies of A and B.
4039 A_tmp = Teuchos::null;
4040 B_tmp = Teuchos::null;
4041
4042 // If Case 2 then sum up *this and distribute it to all processes.
4043 if (Case2) {
4044 this->reduce();
4045 }
4046}
4047
4048template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4054 using Kokkos::ALL;
4055 using Kokkos::subview;
4056 const char tfecfFuncName[] = "elementWiseMultiply: ";
4057
4058 const size_t lclNumRows = this->getLocalLength();
4059 const size_t numVecs = this->getNumVectors();
4060
4061 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclNumRows != A.getLocalLength() || lclNumRows != B.getLocalLength(),
4062 std::runtime_error, "MultiVectors do not have the same local length.");
4064 numVecs != B.getNumVectors(), std::runtime_error,
4065 "this->getNumVectors"
4066 "() = "
4067 << numVecs << " != B.getNumVectors() = " << B.getNumVectors()
4068 << ".");
4069
4070 auto this_view = this->getLocalViewDevice(Access::ReadWrite);
4071 auto A_view = A.getLocalViewDevice(Access::ReadOnly);
4072 auto B_view = B.getLocalViewDevice(Access::ReadOnly);
4073
4074 if (isConstantStride() && B.isConstantStride()) {
4075 // A is just a Vector; it only has one column, so it always has
4076 // constant stride.
4077 //
4078 // If both *this and B have constant stride, we can do an
4079 // element-wise multiply on all columns at once.
4080 KokkosBlas::mult(scalarThis,
4081 this_view,
4082 scalarAB,
4083 subview(A_view, ALL(), 0),
4084 B_view);
4085 } else {
4086 for (size_t j = 0; j < numVecs; ++j) {
4087 const size_t C_col = isConstantStride() ? j : whichVectors_[j];
4088 const size_t B_col = B.isConstantStride() ? j : B.whichVectors_[j];
4089 KokkosBlas::mult(scalarThis,
4091 scalarAB,
4092 subview(A_view, ALL(), 0),
4093 subview(B_view, ALL(), B_col));
4094 }
4095 }
4096}
4097
4098template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4100 reduce() {
4101 using ::Tpetra::Details::allReduceView;
4102 using ::Tpetra::Details::ProfilingRegion;
4103 ProfilingRegion region("Tpetra::MV::reduce");
4104
4105 const auto map = this->getMap();
4106 if (map.get() == nullptr) {
4107 return;
4108 }
4109 const auto comm = map->getComm();
4110 if (comm.get() == nullptr) {
4111 return;
4112 }
4113
4114 // Avoid giving device buffers to MPI. Even if MPI handles them
4115 // correctly, doing so may not perform well.
4116 const bool changed_on_device = this->need_sync_host();
4117 if (changed_on_device) {
4118 // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4119 // and host will be separate allocations. In that case, it may
4120 // pay to do the all-reduce from device to host.
4121 Kokkos::fence("MultiVector::reduce"); // for UVM getLocalViewDevice is UVM which can be read as host by allReduceView, so we must not read until device is fenced
4122 auto X_lcl = this->getLocalViewDevice(Access::ReadWrite);
4123 allReduceView(X_lcl, X_lcl, *comm);
4124 } else {
4125 auto X_lcl = this->getLocalViewHost(Access::ReadWrite);
4126 allReduceView(X_lcl, X_lcl, *comm);
4127 }
4128}
4129
4130template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4133 const size_t col,
4136 const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4137 const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4140 std::runtime_error,
4141 "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4142 << " is invalid. The range of valid row indices on this process "
4143 << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4144 << ", " << maxLocalIndex << "].");
4146 vectorIndexOutOfRange(col),
4147 std::runtime_error,
4148 "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4149 << " of the multivector is invalid.");
4150 }
4151
4152 auto view = this->getLocalViewHost(Access::ReadWrite);
4153 const size_t colInd = isConstantStride() ? col : whichVectors_[col];
4154 view(lclRow, colInd) = ScalarValue;
4155}
4156
4157template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4160 const size_t col,
4161 const impl_scalar_type& value,
4162 const bool atomic) {
4164 const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4165 const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4168 std::runtime_error,
4169 "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4170 << " is invalid. The range of valid row indices on this process "
4171 << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4172 << ", " << maxLocalIndex << "].");
4174 vectorIndexOutOfRange(col),
4175 std::runtime_error,
4176 "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4177 << " of the multivector is invalid.");
4178 }
4179
4180 const size_t colInd = isConstantStride() ? col : whichVectors_[col];
4181
4182 auto view = this->getLocalViewHost(Access::ReadWrite);
4183 if (atomic) {
4184 Kokkos::atomic_add(&(view(lclRow, colInd)), value);
4185 } else {
4186 view(lclRow, colInd) += value;
4187 }
4188}
4189
4190template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4193 const size_t col,
4195 // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4196 // touches the RCP's reference count, which isn't thread safe.
4197 const LocalOrdinal lclRow = this->map_->getLocalElement(gblRow);
4198
4200 const char tfecfFuncName[] = "replaceGlobalValue: ";
4201 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid(),
4202 std::runtime_error,
4203 "Global row index " << gblRow << " is not present on this process "
4204 << this->getMap()->getComm()->getRank() << ".");
4205 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->vectorIndexOutOfRange(col), std::runtime_error,
4206 "Vector index " << col << " of the MultiVector is invalid.");
4207 }
4208
4209 this->replaceLocalValue(lclRow, col, ScalarValue);
4210}
4211
4212template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4215 const size_t col,
4216 const impl_scalar_type& value,
4217 const bool atomic) {
4218 // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4219 // touches the RCP's reference count, which isn't thread safe.
4220 const LocalOrdinal lclRow = this->map_->getLocalElement(globalRow);
4221
4224 lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid(),
4225 std::runtime_error,
4226 "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4227 << " is not present on this process "
4228 << this->getMap()->getComm()->getRank() << ".");
4230 vectorIndexOutOfRange(col),
4231 std::runtime_error,
4232 "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4233 << " of the multivector is invalid.");
4234 }
4235
4236 this->sumIntoLocalValue(lclRow, col, value, atomic);
4237}
4238
4239template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4240template <class T>
4241Teuchos::ArrayRCP<T>
4243 getSubArrayRCP(Teuchos::ArrayRCP<T> arr,
4244 size_t j) const {
4245 typedef Kokkos::DualView<impl_scalar_type*,
4246 typename dual_view_type::array_layout,
4249 const size_t col = isConstantStride() ? j : whichVectors_[j];
4251 Kokkos::subview(view_, Kokkos::ALL(), col);
4252 return Kokkos::Compat::persistingView(X_col.view_device());
4253}
4254
4255template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4257 need_sync_host() const {
4258 return view_.need_sync_host();
4259}
4260
4261template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4263 need_sync_device() const {
4264 return view_.need_sync_device();
4265}
4266
4267template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4268std::string
4270 descriptionImpl(const std::string& className) const {
4271 using Teuchos::TypeNameTraits;
4272
4273 std::ostringstream out;
4274 out << "\"" << className << "\": {";
4275 out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name()
4276 << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name()
4277 << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name()
4278 << ", Node" << Node::name()
4279 << "}, ";
4280 if (this->getObjectLabel() != "") {
4281 out << "Label: \"" << this->getObjectLabel() << "\", ";
4282 }
4283 out << ", numRows: " << this->getGlobalLength();
4284 if (className != "Tpetra::Vector") {
4285 out << ", numCols: " << this->getNumVectors()
4286 << ", isConstantStride: " << this->isConstantStride();
4287 }
4288 if (this->isConstantStride()) {
4289 out << ", columnStride: " << this->getStride();
4290 }
4291 out << "}";
4292
4293 return out.str();
4294}
4295
4296template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4297std::string
4299 description() const {
4300 return this->descriptionImpl("Tpetra::MultiVector");
4301}
4302
4303namespace Details {
4304template <typename ViewType>
4305void print_vector(Teuchos::FancyOStream& out, const char* prefix, const ViewType& v) {
4306 using std::endl;
4307 static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename ViewType::memory_space>::accessible,
4308 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a host-accessible view.");
4309 static_assert(ViewType::rank == 2,
4310 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a rank-2 view.");
4311 // The square braces [] and their contents are in Matlab
4312 // format, so users may copy and paste directly into Matlab.
4313 out << "Values(" << prefix << "): " << std::endl
4314 << "[";
4315 const size_t numRows = v.extent(0);
4316 const size_t numCols = v.extent(1);
4317 if (numCols == 1) {
4318 for (size_t i = 0; i < numRows; ++i) {
4319 out << v(i, 0);
4320 if (i + 1 < numRows) {
4321 out << "; ";
4322 }
4323 }
4324 } else {
4325 for (size_t i = 0; i < numRows; ++i) {
4326 for (size_t j = 0; j < numCols; ++j) {
4327 out << v(i, j);
4328 if (j + 1 < numCols) {
4329 out << ", ";
4330 }
4331 }
4332 if (i + 1 < numRows) {
4333 out << "; ";
4334 }
4335 }
4336 }
4337 out << "]" << endl;
4338}
4339} // namespace Details
4340
4341template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4342std::string
4344 localDescribeToString(const Teuchos::EVerbosityLevel vl) const {
4345 typedef LocalOrdinal LO;
4346 using std::endl;
4347
4348 if (vl <= Teuchos::VERB_LOW) {
4349 return std::string();
4350 }
4351 auto map = this->getMap();
4352 if (map.is_null()) {
4353 return std::string();
4354 }
4355 auto outStringP = Teuchos::rcp(new std::ostringstream());
4356 auto outp = Teuchos::getFancyOStream(outStringP);
4357 Teuchos::FancyOStream& out = *outp;
4358 auto comm = map->getComm();
4359 const int myRank = comm->getRank();
4360 const int numProcs = comm->getSize();
4361
4362 out << "Process " << myRank << " of " << numProcs << ":" << endl;
4363 Teuchos::OSTab tab1(out);
4364
4365 // VERB_MEDIUM and higher prints getLocalLength()
4366 const LO lclNumRows = static_cast<LO>(this->getLocalLength());
4367 out << "Local number of rows: " << lclNumRows << endl;
4368
4369 if (vl > Teuchos::VERB_MEDIUM) {
4370 // VERB_HIGH and higher prints isConstantStride() and getStride().
4371 // The first is only relevant if the Vector has multiple columns.
4372 if (this->getNumVectors() != static_cast<size_t>(1)) {
4373 out << "isConstantStride: " << this->isConstantStride() << endl;
4374 }
4375 if (this->isConstantStride()) {
4376 out << "Column stride: " << this->getStride() << endl;
4377 }
4378
4379 if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4380 // VERB_EXTREME prints values. If we're not doing univied memory, we'll
4382 // so we can't use our regular accessor functins
4383
4384 // NOTE: This is an occasion where we do *not* want the auto-sync stuff
4385 // to trigger (since this function is conceptually const). Thus, we
4386 // get *copies* of the view's data instead.
4387 auto X_dev = view_.getDeviceCopy();
4388 auto X_host = view_.getHostCopy();
4389
4390 if (X_dev.data() == X_host.data()) {
4391 // One single allocation
4392 Details::print_vector(out, "unified", X_host);
4393 } else {
4394 Details::print_vector(out, "host", X_host);
4395 Details::print_vector(out, "dev", X_dev);
4396 }
4397 }
4398 }
4399 out.flush(); // make sure the ostringstream got everything
4400 return outStringP->str();
4401}
4402
4403template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4405 describeImpl(Teuchos::FancyOStream& out,
4406 const std::string& className,
4407 const Teuchos::EVerbosityLevel verbLevel) const {
4408 using std::endl;
4409 using Teuchos::TypeNameTraits;
4410 using Teuchos::VERB_DEFAULT;
4411 using Teuchos::VERB_LOW;
4412 using Teuchos::VERB_NONE;
4413 const Teuchos::EVerbosityLevel vl =
4415
4416 if (vl == VERB_NONE) {
4417 return; // don't print anything
4418 }
4419 // If this Vector's Comm is null, then the Vector does not
4420 // participate in collective operations with the other processes.
4421 // In that case, it is not even legal to call this method. The
4422 // reasonable thing to do in that case is nothing.
4423 auto map = this->getMap();
4424 if (map.is_null()) {
4425 return;
4426 }
4427 auto comm = map->getComm();
4428 if (comm.is_null()) {
4429 return;
4430 }
4431
4432 const int myRank = comm->getRank();
4433
4434 // Only Process 0 should touch the output stream, but this method
4435 // in general may need to do communication. Thus, we may need to
4436 // preserve the current tab level across multiple "if (myRank ==
4437 // 0) { ... }" inner scopes. This is why we sometimes create
4438 // OSTab instances by pointer, instead of by value. We only need
4439 // to create them by pointer if the tab level must persist through
4440 // multiple inner scopes.
4441 Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4442
4443 // VERB_LOW and higher prints the equivalent of description().
4444 if (myRank == 0) {
4445 tab0 = Teuchos::rcp(new Teuchos::OSTab(out));
4446 out << "\"" << className << "\":" << endl;
4447 tab1 = Teuchos::rcp(new Teuchos::OSTab(out));
4448 {
4449 out << "Template parameters:" << endl;
4450 Teuchos::OSTab tab2(out);
4451 out << "Scalar: " << TypeNameTraits<Scalar>::name() << endl
4452 << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name() << endl
4453 << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name() << endl
4454 << "Node: " << Node::name() << endl;
4455 }
4456 if (this->getObjectLabel() != "") {
4457 out << "Label: \"" << this->getObjectLabel() << "\", ";
4458 }
4459 out << "Global number of rows: " << this->getGlobalLength() << endl;
4460 if (className != "Tpetra::Vector") {
4461 out << "Number of columns: " << this->getNumVectors() << endl;
4462 }
4463 // getStride() may differ on different processes, so it (and
4464 // isConstantStride()) properly belong to per-process data.
4465 }
4466
4467 // This is collective over the Map's communicator.
4468 if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4469 const std::string lclStr = this->localDescribeToString(vl);
4471 }
4472}
4473
4474template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4476 describe(Teuchos::FancyOStream& out,
4477 const Teuchos::EVerbosityLevel verbLevel) const {
4478 this->describeImpl(out, "Tpetra::MultiVector", verbLevel);
4479}
4480
4481template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4483 removeEmptyProcessesInPlace(const Teuchos::RCP<const map_type>& newMap) {
4484 replaceMap(newMap);
4485}
4486
4487template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4490 using ::Tpetra::Details::localDeepCopy;
4491 const char prefix[] = "Tpetra::MultiVector::assign: ";
4492
4493 TEUCHOS_TEST_FOR_EXCEPTION(this->getGlobalLength() != src.getGlobalLength() ||
4494 this->getNumVectors() != src.getNumVectors(),
4495 std::invalid_argument,
4496 prefix << "Global dimensions of the two Tpetra::MultiVector "
4497 "objects do not match. src has dimensions ["
4498 << src.getGlobalLength()
4499 << "," << src.getNumVectors() << "], and *this has dimensions ["
4500 << this->getGlobalLength() << "," << this->getNumVectors() << "].");
4501 // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4502 TEUCHOS_TEST_FOR_EXCEPTION(this->getLocalLength() != src.getLocalLength(), std::invalid_argument,
4503 prefix << "The local row counts of the two Tpetra::MultiVector "
4504 "objects do not match. src has "
4505 << src.getLocalLength() << " row(s) "
4506 << " and *this has " << this->getLocalLength() << " row(s).");
4507
4508 // See #1510. We're writing to *this, so we don't care about its
4509 // contents in either memory space, and we don't want
4510 // DualView::modify to complain about "concurrent modification" of
4511 // host and device Views.
4512
4513 // KJ: this is problematic. assign funtion is used to construct a subvector
4514 // if the sync flag is reset here, it lose all our control over getLocalView interface
4515 // this->clear_sync_state();
4516
4517 // If need sync to device, then host has most recent version.
4518 const bool src_last_updated_on_host = src.need_sync_device();
4519
4521 localDeepCopy(this->getLocalViewHost(Access::ReadWrite),
4522 src.getLocalViewHost(Access::ReadOnly),
4523 this->isConstantStride(),
4524 src.isConstantStride(),
4525 this->whichVectors_(),
4526 src.whichVectors_());
4527 } else {
4528 localDeepCopy(this->getLocalViewDevice(Access::ReadWrite),
4529 src.getLocalViewDevice(Access::ReadOnly),
4530 this->isConstantStride(),
4531 src.isConstantStride(),
4532 this->whichVectors_(),
4533 src.whichVectors_());
4534 }
4535}
4536
4537template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4538template <class T>
4539Teuchos::RCP<MultiVector<T, LocalOrdinal, GlobalOrdinal, Node> >
4541 convert() const {
4542 using Teuchos::RCP;
4543
4544 auto newMV = Teuchos::rcp(new MultiVector<T, LocalOrdinal, GlobalOrdinal, Node>(this->getMap(),
4545 this->getNumVectors(),
4546 /*zeroOut=*/false));
4547 Tpetra::deep_copy(*newMV, *this);
4548 return newMV;
4549}
4550
4551template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4554 using ::Tpetra::Details::PackTraits;
4555
4556 const size_t l1 = this->getLocalLength();
4557 const size_t l2 = vec.getLocalLength();
4558 if ((l1 != l2) || (this->getNumVectors() != vec.getNumVectors())) {
4559 return false;
4560 }
4561
4562 return true;
4563}
4564
4565template <class ST, class LO, class GO, class NT>
4568 std::swap(mv.map_, this->map_);
4569 std::swap(mv.view_, this->view_);
4570 std::swap(mv.whichVectors_, this->whichVectors_);
4571}
4572
4573#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4574template <class ST, class LO, class GO, class NT>
4576 const Teuchos::SerialDenseMatrix<int, ST>& src) {
4577 using ::Tpetra::Details::localDeepCopy;
4578 using MV = MultiVector<ST, LO, GO, NT>;
4579 using IST = typename MV::impl_scalar_type;
4580 using input_view_type =
4581 Kokkos::View<const IST**, Kokkos::LayoutLeft,
4582 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4583 using pair_type = std::pair<LO, LO>;
4584
4585 const LO numRows = static_cast<LO>(src.numRows());
4586 const LO numCols = static_cast<LO>(src.numCols());
4587
4588 TEUCHOS_TEST_FOR_EXCEPTION(numRows != static_cast<LO>(dst.getLocalLength()) ||
4589 numCols != static_cast<LO>(dst.getNumVectors()),
4590 std::invalid_argument, "Tpetra::deep_copy: On Process " << dst.getMap()->getComm()->getRank() << ", dst is " << dst.getLocalLength() << " x " << dst.getNumVectors() << ", but src is " << numRows << " x " << numCols << ".");
4591
4592 const IST* src_raw = reinterpret_cast<const IST*>(src.values());
4593 input_view_type src_orig(src_raw, src.stride(), numCols);
4594 input_view_type src_view =
4595 Kokkos::subview(src_orig, pair_type(0, numRows), Kokkos::ALL());
4596
4597 constexpr bool src_isConstantStride = true;
4598 Teuchos::ArrayView<const size_t> srcWhichVectors(nullptr, 0);
4599 localDeepCopy(dst.getLocalViewHost(Access::ReadWrite),
4600 src_view,
4601 dst.isConstantStride(),
4603 getMultiVectorWhichVectors(dst),
4605}
4606
4607template <class ST, class LO, class GO, class NT>
4608void deep_copy(Teuchos::SerialDenseMatrix<int, ST>& dst,
4609 const MultiVector<ST, LO, GO, NT>& src) {
4610 using ::Tpetra::Details::localDeepCopy;
4611 using MV = MultiVector<ST, LO, GO, NT>;
4612 using IST = typename MV::impl_scalar_type;
4613 using output_view_type =
4614 Kokkos::View<IST**, Kokkos::LayoutLeft,
4615 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4616 using pair_type = std::pair<LO, LO>;
4617
4618 const LO numRows = static_cast<LO>(dst.numRows());
4619 const LO numCols = static_cast<LO>(dst.numCols());
4620
4621 TEUCHOS_TEST_FOR_EXCEPTION(numRows != static_cast<LO>(src.getLocalLength()) ||
4622 numCols != static_cast<LO>(src.getNumVectors()),
4623 std::invalid_argument, "Tpetra::deep_copy: On Process " << src.getMap()->getComm()->getRank() << ", src is " << src.getLocalLength() << " x " << src.getNumVectors() << ", but dst is " << numRows << " x " << numCols << ".");
4624
4625 IST* dst_raw = reinterpret_cast<IST*>(dst.values());
4626 output_view_type dst_orig(dst_raw, dst.stride(), numCols);
4627 auto dst_view =
4628 Kokkos::subview(dst_orig, pair_type(0, numRows), Kokkos::ALL());
4629
4630 constexpr bool dst_isConstantStride = true;
4631 Teuchos::ArrayView<const size_t> dstWhichVectors(nullptr, 0);
4632
4633 // Prefer the host version of src's data.
4634 localDeepCopy(dst_view,
4635 src.getLocalViewHost(Access::ReadOnly),
4637 src.isConstantStride(),
4639 getMultiVectorWhichVectors(src));
4640}
4641#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4642
4643template <class Scalar, class LO, class GO, class NT>
4644Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4645createMultiVector(const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4646 size_t numVectors) {
4648 return Teuchos::rcp(new MV(map, numVectors));
4649}
4650
4651template <class ST, class LO, class GO, class NT>
4654 typedef MultiVector<ST, LO, GO, NT> MV;
4655 MV cpy(src.getMap(), src.getNumVectors(), false);
4656 cpy.assign(src);
4657 return cpy;
4658}
4659
4660} // namespace Tpetra
4661
4662//
4663// Explicit instantiation macro
4664//
4665// Must be expanded from within the Tpetra namespace!
4666//
4667
4668#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4669#define TPETRA_MULTIVECTOR_INSTANT(SCALAR, LO, GO, NODE) \
4670 template class MultiVector<SCALAR, LO, GO, NODE>; \
4671 template MultiVector<SCALAR, LO, GO, NODE> createCopy(const MultiVector<SCALAR, LO, GO, NODE>& src); \
4672 template Teuchos::RCP<MultiVector<SCALAR, LO, GO, NODE> > createMultiVector(const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4673 template void deep_copy(MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4674 template void deep_copy(Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4675
4676#else
4677#define TPETRA_MULTIVECTOR_INSTANT(SCALAR, LO, GO, NODE) \
4678 template class MultiVector<SCALAR, LO, GO, NODE>; \
4679 template MultiVector<SCALAR, LO, GO, NODE> createCopy(const MultiVector<SCALAR, LO, GO, NODE>& src);
4680
4681#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4682
4683#define TPETRA_MULTIVECTOR_CONVERT_INSTANT(SO, SI, LO, GO, NODE) \
4684 \
4685 template Teuchos::RCP<MultiVector<SO, LO, GO, NODE> > \
4686 MultiVector<SI, LO, GO, NODE>::convert<SO>() const;
4687
4688#endif // TPETRA_MULTIVECTOR_DEF_HPP
Functions for initializing and finalizing Tpetra.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
All-reduce a 1-D or 2-D Kokkos::View.
Declaration of functions for checking whether a given pointer is accessible from a given Kokkos execu...
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
Declaration of a function that prints strings from each process.
Declaration of Tpetra::Details::isInterComm.
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
Declaration and definition of Tpetra::Details::normImpl, which is an implementation detail of Tpetra:...
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
void unpackAndCombine(const RowView &row_ptrs_beg, const RowView &row_ptrs_end, IndicesView &indices, const Kokkos::View< const GlobalOrdinal *, BufferDevice, Kokkos::MemoryUnmanaged > &imports, const Kokkos::View< const size_t *, BufferDevice, Kokkos::MemoryUnmanaged > &num_packets_per_lid, const Kokkos::View< const LocalOrdinal *, BufferDevice, Kokkos::MemoryUnmanaged > &import_lids, const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::padding_type &padding, const bool unpack_pids, const int myRank, const bool verbose)
Perform the unpack operation for the graph.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
static bool assumeMpiIsGPUAware()
Whether to assume that MPI is CUDA aware.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
static size_t multivectorKernelLocationThreshold()
the threshold for transitioning from device to host
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool remoteLIDsContiguous=false, const CombineMode CM=INSERT)
Reallocate imports_ if needed.
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Number of packets to receive for each receive operation.
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Buffer into which packed data are imported (received from other processes).
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
Number of packets to send for each send operation.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
A parallel distribution of indices over processes.
One or more distributed dense vectors.
void reciprocal(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,...
void normInf(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the infinity-norm of each vector (column), storing the result in a host View.
void get1dCopy(const Teuchos::ArrayView< Scalar > &A, const size_t LDA) const
Fill the given array with a copy of this multivector's local values.
size_t getStride() const
Stride between columns in the multivector.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool areRemoteLIDsContiguous=false, const CombineMode CM=INSERT) override
Reallocate imports_ if needed.
void reduce()
Sum values of a locally replicated multivector across all processes.
virtual bool checkSizes(const SrcDistObject &sourceObj) override
Whether data redistribution between sourceObj and this object is legal.
void randomize()
Set all values in the multivector to pseudorandom numbers.
typename map_type::local_ordinal_type local_ordinal_type
The type of local indices that this class uses.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subView(const Teuchos::Range1D &colRng) const
Return a const MultiVector with const views of selected columns.
void dot(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Teuchos::ArrayView< dot_type > &dots) const
Compute the dot product of each corresponding pair of vectors (columns) in A and B.
void describeImpl(Teuchos::FancyOStream &out, const std::string &className, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Implementation of describe() for this class, and its subclass Vector.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Map and their communicator.
Teuchos::ArrayRCP< Scalar > get1dViewNonConst()
Nonconst persisting (1-D) view of this multivector's local values.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
Teuchos::RCP< Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVectorNonConst(const size_t j)
Return a Vector which is a nonconst view of column j.
void meanValue(const Teuchos::ArrayView< impl_scalar_type > &means) const
Compute mean (average) value of each column.
std::string descriptionImpl(const std::string &className) const
Implementation of description() for this class, and its subclass Vector.
void multiply(Teuchos::ETransp transA, Teuchos::ETransp transB, const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Scalar &beta)
Matrix-matrix multiplication: this = beta*this + alpha*op(A)*op(B).
bool need_sync_device() const
Whether this MultiVector needs synchronization to the device.
wrapped_dual_view_type view_
The Kokkos::DualView containing the MultiVector's data.
void replaceLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using local (row) index.
void swap(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &mv)
Swap contents of mv with contents of *this.
size_t getLocalLength() const
Local number of rows on the calling process.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subCopy(const Teuchos::Range1D &colRng) const
Return a MultiVector with copies of selected columns.
typename Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner ("dot") product result.
Teuchos::ArrayRCP< const Scalar > getData(size_t j) const
Const view of the local values in a particular vector of this multivector.
bool need_sync_host() const
Whether this MultiVector needs synchronization to the host.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM, const execution_space &space) override
Same as copyAndPermute, but do operations in space.
void norm2(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the two-norm of each vector (column), storing the result in a host View.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
Teuchos::ArrayRCP< const Scalar > get1dView() const
Const persisting (1-D) view of this multivector's local values.
Teuchos::ArrayRCP< T > getSubArrayRCP(Teuchos::ArrayRCP< T > arr, size_t j) const
Persisting view of j-th column in the given ArrayRCP.
void replaceGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using global row index.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, device_type > dual_view_type
Kokkos::DualView specialization used by this class.
void elementWiseMultiply(Scalar scalarAB, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarThis)
Multiply a Vector A elementwise by a MultiVector B.
size_t getNumVectors() const
Number of columns in the multivector.
typename Kokkos::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
virtual size_t constantNumberOfPackets() const override
Number of packets to send per LID.
wrapped_dual_view_type getWrappedDualView() const
Return the wrapped dual view holding this MultiVector's local data.
void sumIntoLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &val, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using local row index.
void replaceMap(const Teuchos::RCP< const map_type > &map)
Replace the underlying Map in place.
Teuchos::RCP< MultiVector< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another MultiVector with the same entries, but converted to a different Scalar type T.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
typename device_type::execution_space execution_space
Type of the (new) Kokkos execution space.
void abs(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise absolute values of input Multi-vector in target: A = abs(this)
void norm1(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the one-norm of each vector (column), storing the result in a host view.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print the object with the given verbosity level to a FancyOStream.
void get2dCopy(const Teuchos::ArrayView< const Teuchos::ArrayView< Scalar > > &ArrayOfPtrs) const
Fill the given array with a copy of this multivector's local values.
void sumIntoGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using global row index.
bool aliases(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &other) const
Whether this multivector's memory might alias other. This is conservative: if either this or other is...
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that ther...
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subViewNonConst(const Teuchos::Range1D &colRng)
Return a MultiVector with views of selected columns.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
Teuchos::ArrayRCP< Scalar > getDataNonConst(size_t j)
View of the local values in a particular vector of this multivector.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
size_t getOrigNumLocalCols() const
"Original" number of columns in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< Scalar > > get2dViewNonConst()
Return non-const persisting pointers to values.
virtual std::string description() const override
A simple one-line description of this object.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetViewNonConst(const Teuchos::RCP< const map_type > &subMap, const size_t offset)
Return a nonconst view of a subset of rows.
std::string localDescribeToString(const Teuchos::EVerbosityLevel vl) const
Print the calling process' verbose describe() information to the returned string.
bool isSameSize(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &vec) const
size_t getOrigNumLocalRows() const
"Original" number of rows in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< const Scalar > > get2dView() const
Return const persisting pointers to values.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
Abstract base class for objects that can be the source of an Import or Export operation.
Implementation details of Tpetra.
int local_ordinal_type
Default value of Scalar template parameter.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
bool isInterComm(const Teuchos::Comm< int > &)
Return true if and only if the input communicator wraps an MPI intercommunicator.
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector "constructor": Create a MultiVector from a given Map.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes)
@ INSERT
Insert new values that don't currently exist.