Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_normImpl.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_NORMIMPL_HPP
11#define TPETRA_DETAILS_NORMIMPL_HPP
12
21
22#include "TpetraCore_config.h"
23#include "Kokkos_Core.hpp"
24#include "Teuchos_ArrayView.hpp"
25#include "Teuchos_CommHelpers.hpp"
26#include "KokkosBlas.hpp"
27#include "KokkosKernels_ArithTraits.hpp"
29
30#ifndef DOXYGEN_SHOULD_SKIP_THIS
31namespace Teuchos {
32template <class T>
33class ArrayView; // forward declaration
34template <class OrdinalType>
35class Comm; // forward declaration
36} // namespace Teuchos
37#endif // DOXYGEN_SHOULD_SKIP_THIS
38
40// Declarations start here
42
43namespace Tpetra {
44namespace Details {
45
48 NORM_ONE, //<! Use the one-norm
49 NORM_TWO, //<! Use the two-norm
50 NORM_INF //<! Use the infinity-norm
51};
52
54template <class ValueType,
55 class ArrayLayout,
56 class DeviceType,
57 class MagnitudeType>
59 const Kokkos::View<const ValueType**, ArrayLayout, DeviceType>& X,
61 const Teuchos::ArrayView<const size_t>& whichVecs,
62 const bool isConstantStride,
63 const bool isDistributed,
64 const Teuchos::Comm<int>* comm);
65
66} // namespace Details
67} // namespace Tpetra
68
70// Definitions start here
72
73namespace Tpetra {
74namespace Details {
75namespace Impl {
76
77template <class RV, class XMV>
78void lclNormImpl(const RV& normsOut,
79 const XMV& X,
80 const size_t numVecs,
81 const Teuchos::ArrayView<const size_t>& whichVecs,
82 const bool constantStride,
83 const EWhichNorm whichNorm) {
84 using Kokkos::ALL;
85 using Kokkos::subview;
86 using mag_type = typename RV::non_const_value_type;
87
88 Details::ProfilingRegion region("Tpetra::Details::lclNormImpl");
89
90 static_assert(static_cast<int>(RV::rank) == 1,
91 "Tpetra::MultiVector::lclNormImpl: "
92 "The first argument normsOut must have rank 1.");
93 static_assert(Kokkos::is_view<XMV>::value,
94 "Tpetra::MultiVector::lclNormImpl: "
95 "The second argument X is not a Kokkos::View.");
96 static_assert(static_cast<int>(XMV::rank) == 2,
97 "Tpetra::MultiVector::lclNormImpl: "
98 "The second argument X must have rank 2.");
99
100 const size_t lclNumRows = static_cast<size_t>(X.extent(0));
101 TEUCHOS_TEST_FOR_EXCEPTION(lclNumRows != 0 && constantStride &&
102 static_cast<size_t>(X.extent(1)) != numVecs,
103 std::logic_error, "Constant Stride X's dimensions are " << X.extent(0) << " x " << X.extent(1) << ", which differ from the local dimensions " << lclNumRows << " x " << numVecs << ". Please report this bug to "
104 "the Tpetra developers.");
105 TEUCHOS_TEST_FOR_EXCEPTION(lclNumRows != 0 && !constantStride &&
106 static_cast<size_t>(X.extent(1)) < numVecs,
107 std::logic_error, "Strided X's dimensions are " << X.extent(0) << " x " << X.extent(1) << ", which are incompatible with the local dimensions " << lclNumRows << " x " << numVecs << ". Please report this bug to "
108 "the Tpetra developers.");
109
110 if (lclNumRows == 0) {
111 const mag_type zeroMag = KokkosKernels::ArithTraits<mag_type>::zero();
112 // DEEP_COPY REVIEW - VALUE-TO-DEVICE
113 using execution_space = typename RV::execution_space;
114 Kokkos::deep_copy(execution_space(), normsOut, zeroMag);
115 } else { // lclNumRows != 0
116 if (constantStride) {
117 if (whichNorm == NORM_INF) {
118 KokkosBlas::nrminf(normsOut, X);
119 } else if (whichNorm == NORM_ONE) {
120 KokkosBlas::nrm1(normsOut, X);
121 } else if (whichNorm == NORM_TWO) {
122 KokkosBlas::nrm2_squared(normsOut, X);
123 } else {
124 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
125 }
126 } else { // not constant stride
127 // NOTE (mfh 15 Jul 2014, 11 Apr 2019) This does a kernel launch
128 // for every column. It might be better to have a kernel that
129 // does the work all at once. On the other hand, we don't
130 // prioritize performance of MultiVector views of noncontiguous
131 // columns.
132 for (size_t k = 0; k < numVecs; ++k) {
133 const size_t X_col = constantStride ? k : whichVecs[k];
134 if (whichNorm == NORM_INF) {
135 KokkosBlas::nrminf(subview(normsOut, k),
136 subview(X, ALL(), X_col));
137 } else if (whichNorm == NORM_ONE) {
138 KokkosBlas::nrm1(subview(normsOut, k),
139 subview(X, ALL(), X_col));
140 } else if (whichNorm == NORM_TWO) {
141 KokkosBlas::nrm2_squared(subview(normsOut, k),
142 subview(X, ALL(), X_col));
143 } else {
144 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
145 }
146 } // for each column
147 } // constantStride
148 } // lclNumRows != 0
149}
150
151// Kokkos::parallel_for functor for applying square root to each
152// entry of a 1-D Kokkos::View.
153template <class ViewType>
154class SquareRootFunctor {
155 public:
156 typedef typename ViewType::execution_space execution_space;
157 typedef typename ViewType::size_type size_type;
158
159 SquareRootFunctor(const ViewType& theView)
160 : theView_(theView) {}
161
162 KOKKOS_INLINE_FUNCTION void
163 operator()(const size_type& i) const {
164 typedef typename ViewType::non_const_value_type value_type;
165 typedef KokkosKernels::ArithTraits<value_type> KAT;
166 theView_(i) = KAT::sqrt(theView_(i));
167 }
168
169 private:
170 ViewType theView_;
171};
172
173template <class RV>
174void gblNormImpl(const RV& normsOut,
175 const Teuchos::Comm<int>* const comm,
176 const bool distributed,
177 const EWhichNorm whichNorm) {
178 using Teuchos::REDUCE_MAX;
179 using Teuchos::REDUCE_SUM;
180 using Teuchos::reduceAll;
181 typedef typename RV::non_const_value_type mag_type;
182
183 Details::ProfilingRegion region("Tpetra::Details::gblNormImpl");
184
185 const size_t numVecs = normsOut.extent(0);
186
187 // If the MultiVector is distributed over multiple processes, do
188 // the distributed (interprocess) part of the norm. We assume
189 // that the MPI implementation can read from and write to device
190 // memory.
191 //
192 // replaceMap() may have removed some processes. Those processes
193 // have a null Map. They must not participate in any collective
194 // operations. We ask first whether the Map is null, because
195 // isDistributed() defers that question to the Map. We still
196 // compute and return local norms for processes not participating
197 // in collective operations; those probably don't make any sense,
198 // but it doesn't hurt to do them, since it's illegal to call
199 // norm*() on those processes anyway.
200 if (distributed && comm != nullptr) {
201 // The calling process only participates in the collective if
202 // both the Map and its Comm on that process are nonnull.
203
204 const int nv = static_cast<int>(numVecs);
205 const bool commIsInterComm = ::Tpetra::Details::isInterComm(*comm);
206
207 if (commIsInterComm) {
208 RV lclNorms(Kokkos::ViewAllocateWithoutInitializing("MV::normImpl lcl"), numVecs);
209 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
210 using execution_space = typename RV::execution_space;
211 Kokkos::deep_copy(execution_space(), lclNorms, normsOut);
212 const mag_type* const lclSum = lclNorms.data();
213 mag_type* const gblSum = normsOut.data();
214
215 if (whichNorm == NORM_INF) {
216 reduceAll<int, mag_type>(*comm, REDUCE_MAX, nv, lclSum, gblSum);
217 } else {
218 reduceAll<int, mag_type>(*comm, REDUCE_SUM, nv, lclSum, gblSum);
219 }
220 } else {
221 mag_type* const gblSum = normsOut.data();
222 if (whichNorm == NORM_INF) {
223 reduceAll<int, mag_type>(*comm, REDUCE_MAX, nv, gblSum, gblSum);
224 } else {
225 reduceAll<int, mag_type>(*comm, REDUCE_SUM, nv, gblSum, gblSum);
226 }
227 }
228 }
229
230 if (whichNorm == NORM_TWO) {
231 // Replace the norm-squared results with their square roots in
232 // place, to get the final output. If the device memory and
233 // the host memory are the same, it probably doesn't pay to
234 // launch a parallel kernel for that, since there isn't enough
235 // parallelism for the typical MultiVector case.
236 const bool inHostMemory =
237 std::is_same<typename RV::memory_space,
238 typename RV::host_mirror_space::memory_space>::value;
239 if (inHostMemory) {
240 for (size_t j = 0; j < numVecs; ++j) {
241 normsOut(j) = KokkosKernels::ArithTraits<mag_type>::sqrt(normsOut(j));
242 }
243 } else {
244 // There's not as much parallelism now, but that's OK. The
245 // point of doing parallel dispatch here is to keep the norm
246 // results on the device, thus avoiding a copy to the host
247 // and back again.
248 SquareRootFunctor<RV> f(normsOut);
249 typedef typename RV::execution_space execution_space;
250 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
251 Kokkos::parallel_for(range_type(0, numVecs), f);
252 }
253 }
254}
255
256} // namespace Impl
257
258template <class ValueType,
259 class ArrayLayout,
260 class DeviceType,
261 class MagnitudeType>
263 const Kokkos::View<const ValueType**, ArrayLayout, DeviceType>& X,
264 const EWhichNorm whichNorm,
265 const Teuchos::ArrayView<const size_t>& whichVecs,
266 const bool isConstantStride,
267 const bool isDistributed,
268 const Teuchos::Comm<int>* comm) {
269 using execution_space = typename DeviceType::execution_space;
270 using RV = Kokkos::View<MagnitudeType*, Kokkos::HostSpace>;
271 // using XMV = Kokkos::View<const ValueType**, ArrayLayout, DeviceType>;
272 // using pair_type = std::pair<size_t, size_t>;
273
274 const size_t numVecs = isConstantStride ? static_cast<size_t>(X.extent(1)) : static_cast<size_t>(whichVecs.size());
275 if (numVecs == 0) {
276 return; // nothing to do
277 }
279
280 Impl::lclNormImpl(normsOut, X, numVecs, whichVecs,
281 isConstantStride, whichNorm);
282
283 // lbv 03/15/23: the data from the local norm calculation
284 // better really be available before communication happens
285 // so fencing to make sure the local computations have
286 // completed on device. We might want to make this an
287 // execution space fence down the road?
288 execution_space exec_space_instance = execution_space();
289 exec_space_instance.fence();
290
291 Impl::gblNormImpl(normsOut, comm, isDistributed, whichNorm);
292}
293
294} // namespace Details
295} // namespace Tpetra
296
297#endif // TPETRA_DETAILS_NORMIMPL_HPP
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Struct that holds views of the contents of a CrsMatrix.
Implementation details of Tpetra.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
EWhichNorm
Input argument for normImpl() (which see).
bool isInterComm(const Teuchos::Comm< int > &)
Return true if and only if the input communicator wraps an MPI intercommunicator.
Namespace Tpetra contains the class and methods constituting the Tpetra library.