Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_normImpl.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_NORMIMPL_HPP
11#define TPETRA_DETAILS_NORMIMPL_HPP
12
21
22#include "TpetraCore_config.h"
23#include "Kokkos_Core.hpp"
24#include "Teuchos_ArrayView.hpp"
25#include "Teuchos_CommHelpers.hpp"
26#include "KokkosBlas.hpp"
27#include "KokkosKernels_ArithTraits.hpp"
28
29#ifndef DOXYGEN_SHOULD_SKIP_THIS
30namespace Teuchos {
31template <class T>
32class ArrayView; // forward declaration
33template <class OrdinalType>
34class Comm; // forward declaration
35} // namespace Teuchos
36#endif // DOXYGEN_SHOULD_SKIP_THIS
37
39// Declarations start here
41
42namespace Tpetra {
43namespace Details {
44
47 NORM_ONE, //<! Use the one-norm
48 NORM_TWO, //<! Use the two-norm
49 NORM_INF //<! Use the infinity-norm
50};
51
53template <class ValueType,
54 class ArrayLayout,
55 class DeviceType,
56 class MagnitudeType>
58 const Kokkos::View<const ValueType**, ArrayLayout, DeviceType>& X,
60 const Teuchos::ArrayView<const size_t>& whichVecs,
61 const bool isConstantStride,
62 const bool isDistributed,
63 const Teuchos::Comm<int>* comm);
64
65} // namespace Details
66} // namespace Tpetra
67
69// Definitions start here
71
72namespace Tpetra {
73namespace Details {
74namespace Impl {
75
76template <class RV, class XMV>
77void lclNormImpl(const RV& normsOut,
78 const XMV& X,
79 const size_t numVecs,
80 const Teuchos::ArrayView<const size_t>& whichVecs,
81 const bool constantStride,
82 const EWhichNorm whichNorm) {
83 using Kokkos::ALL;
84 using Kokkos::subview;
85 using mag_type = typename RV::non_const_value_type;
86
87 static_assert(static_cast<int>(RV::rank) == 1,
88 "Tpetra::MultiVector::lclNormImpl: "
89 "The first argument normsOut must have rank 1.");
90 static_assert(Kokkos::is_view<XMV>::value,
91 "Tpetra::MultiVector::lclNormImpl: "
92 "The second argument X is not a Kokkos::View.");
93 static_assert(static_cast<int>(XMV::rank) == 2,
94 "Tpetra::MultiVector::lclNormImpl: "
95 "The second argument X must have rank 2.");
96
97 const size_t lclNumRows = static_cast<size_t>(X.extent(0));
98 TEUCHOS_TEST_FOR_EXCEPTION(lclNumRows != 0 && constantStride &&
99 static_cast<size_t>(X.extent(1)) != numVecs,
100 std::logic_error, "Constant Stride X's dimensions are " << X.extent(0) << " x " << X.extent(1) << ", which differ from the local dimensions " << lclNumRows << " x " << numVecs << ". Please report this bug to "
101 "the Tpetra developers.");
102 TEUCHOS_TEST_FOR_EXCEPTION(lclNumRows != 0 && !constantStride &&
103 static_cast<size_t>(X.extent(1)) < numVecs,
104 std::logic_error, "Strided X's dimensions are " << X.extent(0) << " x " << X.extent(1) << ", which are incompatible with the local dimensions " << lclNumRows << " x " << numVecs << ". Please report this bug to "
105 "the Tpetra developers.");
106
107 if (lclNumRows == 0) {
108 const mag_type zeroMag = KokkosKernels::ArithTraits<mag_type>::zero();
109 // DEEP_COPY REVIEW - VALUE-TO-DEVICE
110 using execution_space = typename RV::execution_space;
111 Kokkos::deep_copy(execution_space(), normsOut, zeroMag);
112 } else { // lclNumRows != 0
113 if (constantStride) {
114 if (whichNorm == NORM_INF) {
115 KokkosBlas::nrminf(normsOut, X);
116 } else if (whichNorm == NORM_ONE) {
117 KokkosBlas::nrm1(normsOut, X);
118 } else if (whichNorm == NORM_TWO) {
119 KokkosBlas::nrm2_squared(normsOut, X);
120 } else {
121 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
122 }
123 } else { // not constant stride
124 // NOTE (mfh 15 Jul 2014, 11 Apr 2019) This does a kernel launch
125 // for every column. It might be better to have a kernel that
126 // does the work all at once. On the other hand, we don't
127 // prioritize performance of MultiVector views of noncontiguous
128 // columns.
129 for (size_t k = 0; k < numVecs; ++k) {
130 const size_t X_col = constantStride ? k : whichVecs[k];
131 if (whichNorm == NORM_INF) {
132 KokkosBlas::nrminf(subview(normsOut, k),
133 subview(X, ALL(), X_col));
134 } else if (whichNorm == NORM_ONE) {
135 KokkosBlas::nrm1(subview(normsOut, k),
136 subview(X, ALL(), X_col));
137 } else if (whichNorm == NORM_TWO) {
138 KokkosBlas::nrm2_squared(subview(normsOut, k),
139 subview(X, ALL(), X_col));
140 } else {
141 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
142 }
143 } // for each column
144 } // constantStride
145 } // lclNumRows != 0
146}
147
148// Kokkos::parallel_for functor for applying square root to each
149// entry of a 1-D Kokkos::View.
150template <class ViewType>
151class SquareRootFunctor {
152 public:
153 typedef typename ViewType::execution_space execution_space;
154 typedef typename ViewType::size_type size_type;
155
156 SquareRootFunctor(const ViewType& theView)
157 : theView_(theView) {}
158
159 KOKKOS_INLINE_FUNCTION void
160 operator()(const size_type& i) const {
161 typedef typename ViewType::non_const_value_type value_type;
162 typedef KokkosKernels::ArithTraits<value_type> KAT;
163 theView_(i) = KAT::sqrt(theView_(i));
164 }
165
166 private:
167 ViewType theView_;
168};
169
170template <class RV>
171void gblNormImpl(const RV& normsOut,
172 const Teuchos::Comm<int>* const comm,
173 const bool distributed,
174 const EWhichNorm whichNorm) {
175 using Teuchos::REDUCE_MAX;
176 using Teuchos::REDUCE_SUM;
177 using Teuchos::reduceAll;
178 typedef typename RV::non_const_value_type mag_type;
179
180 const size_t numVecs = normsOut.extent(0);
181
182 // If the MultiVector is distributed over multiple processes, do
183 // the distributed (interprocess) part of the norm. We assume
184 // that the MPI implementation can read from and write to device
185 // memory.
186 //
187 // replaceMap() may have removed some processes. Those processes
188 // have a null Map. They must not participate in any collective
189 // operations. We ask first whether the Map is null, because
190 // isDistributed() defers that question to the Map. We still
191 // compute and return local norms for processes not participating
192 // in collective operations; those probably don't make any sense,
193 // but it doesn't hurt to do them, since it's illegal to call
194 // norm*() on those processes anyway.
195 if (distributed && comm != nullptr) {
196 // The calling process only participates in the collective if
197 // both the Map and its Comm on that process are nonnull.
198
199 const int nv = static_cast<int>(numVecs);
200 const bool commIsInterComm = ::Tpetra::Details::isInterComm(*comm);
201
202 if (commIsInterComm) {
203 RV lclNorms(Kokkos::ViewAllocateWithoutInitializing("MV::normImpl lcl"), numVecs);
204 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
205 using execution_space = typename RV::execution_space;
206 Kokkos::deep_copy(execution_space(), lclNorms, normsOut);
207 const mag_type* const lclSum = lclNorms.data();
208 mag_type* const gblSum = normsOut.data();
209
210 if (whichNorm == NORM_INF) {
211 reduceAll<int, mag_type>(*comm, REDUCE_MAX, nv, lclSum, gblSum);
212 } else {
213 reduceAll<int, mag_type>(*comm, REDUCE_SUM, nv, lclSum, gblSum);
214 }
215 } else {
216 mag_type* const gblSum = normsOut.data();
217 if (whichNorm == NORM_INF) {
218 reduceAll<int, mag_type>(*comm, REDUCE_MAX, nv, gblSum, gblSum);
219 } else {
220 reduceAll<int, mag_type>(*comm, REDUCE_SUM, nv, gblSum, gblSum);
221 }
222 }
223 }
224
225 if (whichNorm == NORM_TWO) {
226 // Replace the norm-squared results with their square roots in
227 // place, to get the final output. If the device memory and
228 // the host memory are the same, it probably doesn't pay to
229 // launch a parallel kernel for that, since there isn't enough
230 // parallelism for the typical MultiVector case.
231 const bool inHostMemory =
232 std::is_same<typename RV::memory_space,
233 typename RV::host_mirror_space::memory_space>::value;
234 if (inHostMemory) {
235 for (size_t j = 0; j < numVecs; ++j) {
236 normsOut(j) = KokkosKernels::ArithTraits<mag_type>::sqrt(normsOut(j));
237 }
238 } else {
239 // There's not as much parallelism now, but that's OK. The
240 // point of doing parallel dispatch here is to keep the norm
241 // results on the device, thus avoiding a copy to the host
242 // and back again.
243 SquareRootFunctor<RV> f(normsOut);
244 typedef typename RV::execution_space execution_space;
245 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
246 Kokkos::parallel_for(range_type(0, numVecs), f);
247 }
248 }
249}
250
251} // namespace Impl
252
253template <class ValueType,
254 class ArrayLayout,
255 class DeviceType,
256 class MagnitudeType>
258 const Kokkos::View<const ValueType**, ArrayLayout, DeviceType>& X,
259 const EWhichNorm whichNorm,
260 const Teuchos::ArrayView<const size_t>& whichVecs,
261 const bool isConstantStride,
262 const bool isDistributed,
263 const Teuchos::Comm<int>* comm) {
264 using execution_space = typename DeviceType::execution_space;
265 using RV = Kokkos::View<MagnitudeType*, Kokkos::HostSpace>;
266 // using XMV = Kokkos::View<const ValueType**, ArrayLayout, DeviceType>;
267 // using pair_type = std::pair<size_t, size_t>;
268
269 const size_t numVecs = isConstantStride ? static_cast<size_t>(X.extent(1)) : static_cast<size_t>(whichVecs.size());
270 if (numVecs == 0) {
271 return; // nothing to do
272 }
274
275 Impl::lclNormImpl(normsOut, X, numVecs, whichVecs,
276 isConstantStride, whichNorm);
277
278 // lbv 03/15/23: the data from the local norm calculation
279 // better really be available before communication happens
280 // so fencing to make sure the local computations have
281 // completed on device. We might want to make this an
282 // execution space fence down the road?
283 execution_space exec_space_instance = execution_space();
284 exec_space_instance.fence();
285
286 Impl::gblNormImpl(normsOut, comm, isDistributed, whichNorm);
287}
288
289} // namespace Details
290} // namespace Tpetra
291
292#endif // TPETRA_DETAILS_NORMIMPL_HPP
Struct that holds views of the contents of a CrsMatrix.
Implementation details of Tpetra.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
EWhichNorm
Input argument for normImpl() (which see).
bool isInterComm(const Teuchos::Comm< int > &)
Return true if and only if the input communicator wraps an MPI intercommunicator.
Namespace Tpetra contains the class and methods constituting the Tpetra library.