Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_normImpl.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_NORMIMPL_HPP
11#define TPETRA_DETAILS_NORMIMPL_HPP
12
21
22#include "TpetraCore_config.h"
23#include "Kokkos_Core.hpp"
24#include "Teuchos_ArrayView.hpp"
25#include "Teuchos_CommHelpers.hpp"
26#include "KokkosBlas.hpp"
27#if KOKKOS_VERSION >= 40799
28#include "KokkosKernels_ArithTraits.hpp"
29#else
30#include "Kokkos_ArithTraits.hpp"
31#endif
32
33#ifndef DOXYGEN_SHOULD_SKIP_THIS
34namespace Teuchos {
35template <class T>
36class ArrayView; // forward declaration
37template <class OrdinalType>
38class Comm; // forward declaration
39} // namespace Teuchos
40#endif // DOXYGEN_SHOULD_SKIP_THIS
41
43// Declarations start here
45
46namespace Tpetra {
47namespace Details {
48
51 NORM_ONE, //<! Use the one-norm
52 NORM_TWO, //<! Use the two-norm
53 NORM_INF //<! Use the infinity-norm
54};
55
57template <class ValueType,
58 class ArrayLayout,
59 class DeviceType,
60 class MagnitudeType>
62 const Kokkos::View<const ValueType**, ArrayLayout, DeviceType>& X,
64 const Teuchos::ArrayView<const size_t>& whichVecs,
65 const bool isConstantStride,
66 const bool isDistributed,
67 const Teuchos::Comm<int>* comm);
68
69} // namespace Details
70} // namespace Tpetra
71
73// Definitions start here
75
76namespace Tpetra {
77namespace Details {
78namespace Impl {
79
80template <class RV, class XMV>
81void lclNormImpl(const RV& normsOut,
82 const XMV& X,
83 const size_t numVecs,
84 const Teuchos::ArrayView<const size_t>& whichVecs,
85 const bool constantStride,
86 const EWhichNorm whichNorm) {
87 using Kokkos::ALL;
88 using Kokkos::subview;
89 using mag_type = typename RV::non_const_value_type;
90
91 static_assert(static_cast<int>(RV::rank) == 1,
92 "Tpetra::MultiVector::lclNormImpl: "
93 "The first argument normsOut must have rank 1.");
94 static_assert(Kokkos::is_view<XMV>::value,
95 "Tpetra::MultiVector::lclNormImpl: "
96 "The second argument X is not a Kokkos::View.");
97 static_assert(static_cast<int>(XMV::rank) == 2,
98 "Tpetra::MultiVector::lclNormImpl: "
99 "The second argument X must have rank 2.");
100
101 const size_t lclNumRows = static_cast<size_t>(X.extent(0));
102 TEUCHOS_TEST_FOR_EXCEPTION(lclNumRows != 0 && constantStride &&
103 static_cast<size_t>(X.extent(1)) != numVecs,
104 std::logic_error, "Constant Stride X's dimensions are " << X.extent(0) << " x " << X.extent(1) << ", which differ from the local dimensions " << lclNumRows << " x " << numVecs << ". Please report this bug to "
105 "the Tpetra developers.");
106 TEUCHOS_TEST_FOR_EXCEPTION(lclNumRows != 0 && !constantStride &&
107 static_cast<size_t>(X.extent(1)) < numVecs,
108 std::logic_error, "Strided X's dimensions are " << X.extent(0) << " x " << X.extent(1) << ", which are incompatible with the local dimensions " << lclNumRows << " x " << numVecs << ". Please report this bug to "
109 "the Tpetra developers.");
110
111 if (lclNumRows == 0) {
112#if KOKKOS_VERSION >= 40799
113 const mag_type zeroMag = KokkosKernels::ArithTraits<mag_type>::zero();
114#else
115 const mag_type zeroMag = Kokkos::ArithTraits<mag_type>::zero();
116#endif
117 // DEEP_COPY REVIEW - VALUE-TO-DEVICE
118 using execution_space = typename RV::execution_space;
119 Kokkos::deep_copy(execution_space(), normsOut, zeroMag);
120 } else { // lclNumRows != 0
121 if (constantStride) {
122 if (whichNorm == NORM_INF) {
123 KokkosBlas::nrminf(normsOut, X);
124 } else if (whichNorm == NORM_ONE) {
125 KokkosBlas::nrm1(normsOut, X);
126 } else if (whichNorm == NORM_TWO) {
127 KokkosBlas::nrm2_squared(normsOut, X);
128 } else {
129 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
130 }
131 } else { // not constant stride
132 // NOTE (mfh 15 Jul 2014, 11 Apr 2019) This does a kernel launch
133 // for every column. It might be better to have a kernel that
134 // does the work all at once. On the other hand, we don't
135 // prioritize performance of MultiVector views of noncontiguous
136 // columns.
137 for (size_t k = 0; k < numVecs; ++k) {
138 const size_t X_col = constantStride ? k : whichVecs[k];
139 if (whichNorm == NORM_INF) {
140 KokkosBlas::nrminf(subview(normsOut, k),
141 subview(X, ALL(), X_col));
142 } else if (whichNorm == NORM_ONE) {
143 KokkosBlas::nrm1(subview(normsOut, k),
144 subview(X, ALL(), X_col));
145 } else if (whichNorm == NORM_TWO) {
146 KokkosBlas::nrm2_squared(subview(normsOut, k),
147 subview(X, ALL(), X_col));
148 } else {
149 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
150 }
151 } // for each column
152 } // constantStride
153 } // lclNumRows != 0
154}
155
156// Kokkos::parallel_for functor for applying square root to each
157// entry of a 1-D Kokkos::View.
158template <class ViewType>
159class SquareRootFunctor {
160 public:
161 typedef typename ViewType::execution_space execution_space;
162 typedef typename ViewType::size_type size_type;
163
164 SquareRootFunctor(const ViewType& theView)
165 : theView_(theView) {}
166
167 KOKKOS_INLINE_FUNCTION void
168 operator()(const size_type& i) const {
169 typedef typename ViewType::non_const_value_type value_type;
170#if KOKKOS_VERSION >= 40799
171 typedef KokkosKernels::ArithTraits<value_type> KAT;
172#else
173 typedef Kokkos::ArithTraits<value_type> KAT;
174#endif
175 theView_(i) = KAT::sqrt(theView_(i));
176 }
177
178 private:
179 ViewType theView_;
180};
181
182template <class RV>
183void gblNormImpl(const RV& normsOut,
184 const Teuchos::Comm<int>* const comm,
185 const bool distributed,
186 const EWhichNorm whichNorm) {
187 using Teuchos::REDUCE_MAX;
188 using Teuchos::REDUCE_SUM;
189 using Teuchos::reduceAll;
190 typedef typename RV::non_const_value_type mag_type;
191
192 const size_t numVecs = normsOut.extent(0);
193
194 // If the MultiVector is distributed over multiple processes, do
195 // the distributed (interprocess) part of the norm. We assume
196 // that the MPI implementation can read from and write to device
197 // memory.
198 //
199 // replaceMap() may have removed some processes. Those processes
200 // have a null Map. They must not participate in any collective
201 // operations. We ask first whether the Map is null, because
202 // isDistributed() defers that question to the Map. We still
203 // compute and return local norms for processes not participating
204 // in collective operations; those probably don't make any sense,
205 // but it doesn't hurt to do them, since it's illegal to call
206 // norm*() on those processes anyway.
207 if (distributed && comm != nullptr) {
208 // The calling process only participates in the collective if
209 // both the Map and its Comm on that process are nonnull.
210
211 const int nv = static_cast<int>(numVecs);
212 const bool commIsInterComm = ::Tpetra::Details::isInterComm(*comm);
213
214 if (commIsInterComm) {
215 RV lclNorms(Kokkos::ViewAllocateWithoutInitializing("MV::normImpl lcl"), numVecs);
216 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
217 using execution_space = typename RV::execution_space;
218 Kokkos::deep_copy(execution_space(), lclNorms, normsOut);
219 const mag_type* const lclSum = lclNorms.data();
220 mag_type* const gblSum = normsOut.data();
221
222 if (whichNorm == NORM_INF) {
223 reduceAll<int, mag_type>(*comm, REDUCE_MAX, nv, lclSum, gblSum);
224 } else {
225 reduceAll<int, mag_type>(*comm, REDUCE_SUM, nv, lclSum, gblSum);
226 }
227 } else {
228 mag_type* const gblSum = normsOut.data();
229 if (whichNorm == NORM_INF) {
230 reduceAll<int, mag_type>(*comm, REDUCE_MAX, nv, gblSum, gblSum);
231 } else {
232 reduceAll<int, mag_type>(*comm, REDUCE_SUM, nv, gblSum, gblSum);
233 }
234 }
235 }
236
237 if (whichNorm == NORM_TWO) {
238 // Replace the norm-squared results with their square roots in
239 // place, to get the final output. If the device memory and
240 // the host memory are the same, it probably doesn't pay to
241 // launch a parallel kernel for that, since there isn't enough
242 // parallelism for the typical MultiVector case.
243 const bool inHostMemory =
244 std::is_same<typename RV::memory_space,
245 typename RV::host_mirror_space::memory_space>::value;
246 if (inHostMemory) {
247 for (size_t j = 0; j < numVecs; ++j) {
248#if KOKKOS_VERSION >= 40799
249 normsOut(j) = KokkosKernels::ArithTraits<mag_type>::sqrt(normsOut(j));
250#else
251 normsOut(j) = Kokkos::ArithTraits<mag_type>::sqrt(normsOut(j));
252#endif
253 }
254 } else {
255 // There's not as much parallelism now, but that's OK. The
256 // point of doing parallel dispatch here is to keep the norm
257 // results on the device, thus avoiding a copy to the host
258 // and back again.
259 SquareRootFunctor<RV> f(normsOut);
260 typedef typename RV::execution_space execution_space;
261 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
262 Kokkos::parallel_for(range_type(0, numVecs), f);
263 }
264 }
265}
266
267} // namespace Impl
268
269template <class ValueType,
270 class ArrayLayout,
271 class DeviceType,
272 class MagnitudeType>
274 const Kokkos::View<const ValueType**, ArrayLayout, DeviceType>& X,
275 const EWhichNorm whichNorm,
276 const Teuchos::ArrayView<const size_t>& whichVecs,
277 const bool isConstantStride,
278 const bool isDistributed,
279 const Teuchos::Comm<int>* comm) {
280 using execution_space = typename DeviceType::execution_space;
281 using RV = Kokkos::View<MagnitudeType*, Kokkos::HostSpace>;
282 // using XMV = Kokkos::View<const ValueType**, ArrayLayout, DeviceType>;
283 // using pair_type = std::pair<size_t, size_t>;
284
285 const size_t numVecs = isConstantStride ? static_cast<size_t>(X.extent(1)) : static_cast<size_t>(whichVecs.size());
286 if (numVecs == 0) {
287 return; // nothing to do
288 }
290
291 Impl::lclNormImpl(normsOut, X, numVecs, whichVecs,
292 isConstantStride, whichNorm);
293
294 // lbv 03/15/23: the data from the local norm calculation
295 // better really be available before communication happens
296 // so fencing to make sure the local computations have
297 // completed on device. We might want to make this an
298 // execution space fence down the road?
299 execution_space exec_space_instance = execution_space();
300 exec_space_instance.fence();
301
302 Impl::gblNormImpl(normsOut, comm, isDistributed, whichNorm);
303}
304
305} // namespace Details
306} // namespace Tpetra
307
308#endif // TPETRA_DETAILS_NORMIMPL_HPP
Struct that holds views of the contents of a CrsMatrix.
Implementation details of Tpetra.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
EWhichNorm
Input argument for normImpl() (which see).
bool isInterComm(const Teuchos::Comm< int > &)
Return true if and only if the input communicator wraps an MPI intercommunicator.
Namespace Tpetra contains the class and methods constituting the Tpetra library.