Ifpack2 Templated Preconditioning Package Version 1.0
Ifpack2_BlockComputeResidualVector.hpp
1// @HEADER
2// *****************************************************************************
3// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4//
5// Copyright 2009 NTESS and the Ifpack2 contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
11#define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
12
13#include "Ifpack2_BlockHelper.hpp"
14
15namespace Ifpack2 {
16
17namespace BlockHelperDetails {
18
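// AmD ("A minus D") holds the off-diagonal remainder R of the splitting
// A = D + R used by the block tridiagonal solver (see PartInterface below).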
22template <typename MatrixType>
23struct AmD {
 24  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
 25  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
26 using size_type_1d_view = typename impl_type::size_type_1d_view;
27 using i64_3d_view = typename impl_type::i64_3d_view;
28 using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
29 // rowptr points to the start of each row of A_colindsub.
30 size_type_1d_view rowptr, rowptr_remote;
31 // Indices into A's rows giving the blocks to extract. rowptr(i) points to
32 // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
33 // where g is A's graph, are the columns AmD uses. If seq_method_, then
34 // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
35 // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
36 // contains the remote ones.
37 local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
38 // Precomputed direct offsets to A,x blocks, for owned entries (OverlapTag case) or all entries (AsyncTag case)
39 i64_3d_view A_x_offsets;
40 // Precomputed direct offsets to A,x blocks, for non-owned entries (OverlapTag case). For AsyncTag case this is left empty.
41 i64_3d_view A_x_offsets_remote;
42
43 // Currently always true.
44 bool is_tpetra_block_crs;
45
46 // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
47 impl_scalar_type_1d_view_tpetra tpetra_values;
48
49 AmD() = default;
50 AmD(const AmD &b) = default;
51};
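// Illustrative sketch (not part of this header): visiting the owned
// off-diagonal blocks of local row lr that AmD extracts, assuming a
// BlockCrsMatrix with block size bs and block row pointer A_block_rowptr
// (this mirrors the SeqTag operator() further below):
//
//   const size_type A_k0 = A_block_rowptr(lr);
//   for (size_type k = rowptr(lr); k < rowptr(lr + 1); ++k) {
//     const size_type j = A_k0 + A_colindsub(k);       // entry index in A's graph
//     const impl_scalar_type *blk = &tpetra_values(j * bs * bs);
//     // blk is the bs x bs block of R = A - D at (lr, A_colind(j))
//   }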
52
53template <typename MatrixType>
54struct PartInterface {
55 using local_ordinal_type = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
56 using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
57 using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;
58
59 PartInterface() = default;
60 PartInterface(const PartInterface &b) = default;
61
62 // Some terms:
63 // The matrix A is split as A = D + R, where D is the matrix of tridiag
64 // blocks and R is the remainder.
65 // A part is roughly a synonym for a tridiag. The distinction is that a part
66 // is the set of rows belonging to one tridiag and, equivalently, the off-diag
67 // rows in R associated with that tridiag. In contrast, the term tridiag is
68 // used to refer specifically to tridiag data, such as the pointer into the
69 // tridiag data array.
 70 // Local (lcl) rows are the LIDs. lclrow lists the LIDs belonging to each
71 // tridiag, and partptr points to the beginning of each tridiag. This is the
72 // LID space.
73 // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
74 // by this ordinal. This is the 'index' space.
75 // A flat index is the mathematical index into an array. A pack index
76 // accounts for SIMD packing.
77
78 // Local row LIDs. Permutation from caller's index space to tridiag index
79 // space.
80 local_ordinal_type_1d_view lclrow;
81 // partptr_ is the pointer array into lclrow_.
82 local_ordinal_type_1d_view partptr; // np+1
83 local_ordinal_type_2d_view partptr_sub;
84 local_ordinal_type_1d_view partptr_schur;
85 // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
86 // is the start of the i'th pack.
87 local_ordinal_type_1d_view packptr; // npack+1
88 local_ordinal_type_1d_view packptr_sub;
89 local_ordinal_type_1d_view packindices_sub;
90 local_ordinal_type_2d_view packindices_schur;
91 // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
92 // an alias of partptr_ in the case of no overlap.
93 local_ordinal_type_1d_view part2rowidx0; // np+1
94 local_ordinal_type_1d_view part2rowidx0_sub;
95 // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
96 // it's the same as part2rowidx0_; if it's > 1, then the value is combined
97 // with i % vector_length to get the location in the packed data.
98 local_ordinal_type_1d_view part2packrowidx0; // np+1
99 local_ordinal_type_2d_view part2packrowidx0_sub;
100 local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
101 // rowidx2part_ maps the row index to the part index.
102 local_ordinal_type_1d_view rowidx2part; // nr
103 local_ordinal_type_1d_view rowidx2part_sub;
104 // True if lcl{row|col} is at most a constant away from row{idx|col}. In
105 // practice, this knowledge is not particularly useful, as packing for batched
106 // processing is done at the same time as the permutation from LID to index
107 // space. But it's easy to detect, so it's recorded in case an optimization
108 // can be made based on it.
109 bool row_contiguous;
110
111 local_ordinal_type max_partsz;
112 local_ordinal_type max_subpartsz;
113 local_ordinal_type n_subparts_per_part;
114 local_ordinal_type nparts;
115};
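// Illustrative sketch (not part of this header), assuming two parts of three
// rows each, vector_length = 2, and no sub-parts or overlap:
//
//   partptr          = {0, 3, 6}        // rows of part p: partptr(p) .. partptr(p+1)-1
//   packptr          = {0, 2}           // one pack containing parts 0 and 1
//   part2rowidx0     = {0, 3, 6}        // flat row index at the start of each part
//   part2packrowidx0 = {0, 0, 3}        // both parts map to the same packed rows
//   rowidx2part      = {0, 0, 0, 1, 1, 1}
//
// Row index rowidx of part p is stored at packed row
// part2packrowidx0(p) + (rowidx - partptr(p)), SIMD lane p % vector_length
// (see the pri and v computations in the kernels below).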
116
117// Precompute offsets of each A and x entry to speed up residual.
118// (Applies for hasBlockCrsMatrix == true and OverlapTag/AsyncTag)
 119 // Reading A and x takes up to 4 and 6 levels of indirection, respectively,
 120 // but precomputing the offsets reduces both to 2.
121//
122// This function allocates and populates these members of AmD:
123// A_x_offsets, A_x_offsets_remote
124template <typename MatrixType>
125void precompute_A_x_offsets(
126 AmD<MatrixType> &amd,
127 const PartInterface<MatrixType> &interf,
128 const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_crs_graph_type> &g,
129 const typename ImplType<MatrixType>::local_ordinal_type_1d_view &dm2cm,
130 int blocksize,
131 bool ownedRemoteSeparate) {
132 using impl_type = ImplType<MatrixType>;
133 using i64_3d_view = typename impl_type::i64_3d_view;
134 using size_type = typename impl_type::size_type;
135 using local_ordinal_type = typename impl_type::local_ordinal_type;
136 using execution_space = typename impl_type::execution_space;
137 auto local_graph = g->getLocalGraphDevice();
138 const auto A_block_rowptr = local_graph.row_map;
139 const auto A_colind = local_graph.entries;
140 local_ordinal_type numLocalRows = interf.rowidx2part.extent(0);
141 int blocksize_square = blocksize * blocksize;
142 // shallow-copying views to avoid capturing the amd, interf objects in lambdas
143 auto lclrow = interf.lclrow;
144 auto A_colindsub = amd.A_colindsub;
145 auto A_colindsub_remote = amd.A_colindsub_remote;
146 auto rowptr = amd.rowptr;
147 auto rowptr_remote = amd.rowptr_remote;
148 bool is_dm2cm_active = dm2cm.extent(0);
149 if (ownedRemoteSeparate) {
150 // amd.rowptr points to owned entries only, and amd.rowptr_remote points to nonowned.
151 local_ordinal_type maxOwnedEntriesPerRow = 0;
152 local_ordinal_type maxNonownedEntriesPerRow = 0;
153 Kokkos::parallel_reduce(
154 Kokkos::RangePolicy<execution_space>(0, numLocalRows),
155 KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type & lmaxOwned, local_ordinal_type & lmaxNonowned) {
156 const local_ordinal_type lr = lclrow(i);
157 local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowptr(lr);
158 if (rowNumOwned > lmaxOwned)
159 lmaxOwned = rowNumOwned;
160 // rowptr_remote won't be allocated for single-rank problems
161 if (rowptr_remote.extent(0)) {
162 local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowptr_remote(lr);
163 if (rowNumNonowned > lmaxNonowned)
164 lmaxNonowned = rowNumNonowned;
165 } else {
166 lmaxNonowned = 0;
167 }
168 },
169 Kokkos::Max<local_ordinal_type>(maxOwnedEntriesPerRow), Kokkos::Max<local_ordinal_type>(maxNonownedEntriesPerRow));
170 // Allocate the two offsets views now that we know the dimensions
171 // For each one, the middle dimension is 0 for A offsets and 1 for x offsets.
172 // Packing them together in one view improves cache line utilization
173 amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxOwnedEntriesPerRow);
174 amd.A_x_offsets_remote = i64_3d_view("amd.A_x_offsets_remote", numLocalRows, 2, maxNonownedEntriesPerRow);
175 auto A_x_offsets = amd.A_x_offsets;
176 auto A_x_offsets_remote = amd.A_x_offsets_remote;
177 // Now, populate all the offsets. Use ArithTraits<int64_t>::min to mark absent entries.
178 Kokkos::parallel_for(
179 Kokkos::RangePolicy<execution_space>(0, numLocalRows),
180 KOKKOS_LAMBDA(local_ordinal_type i) {
181 const local_ordinal_type lr = lclrow(i);
182 const size_type A_k0 = A_block_rowptr(lr);
183 // Owned entries
184 size_type rowBegin = rowptr(lr);
185 local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowBegin;
186 for (local_ordinal_type entry = 0; entry < maxOwnedEntriesPerRow; entry++) {
187 if (entry < rowNumOwned) {
188 const size_type j = A_k0 + A_colindsub(rowBegin + entry);
189 const local_ordinal_type A_colind_at_j = A_colind(j);
190 const local_ordinal_type loc = is_dm2cm_active ? dm2cm(A_colind_at_j) : A_colind_at_j;
191 A_x_offsets(i, 0, entry) = int64_t(j) * blocksize_square;
192 A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
193 } else {
194#if KOKKOS_VERSION >= 40799
195 A_x_offsets(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
196#else
197 A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
198#endif
199#if KOKKOS_VERSION >= 40799
200 A_x_offsets(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
201#else
202 A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
203#endif
204 }
205 }
206 // Nonowned entries
207 if (rowptr_remote.extent(0)) {
208 rowBegin = rowptr_remote(lr);
209 local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowBegin;
210 for (local_ordinal_type entry = 0; entry < maxNonownedEntriesPerRow; entry++) {
211 if (entry < rowNumNonowned) {
212 const size_type j = A_k0 + A_colindsub_remote(rowBegin + entry);
213 const local_ordinal_type A_colind_at_j = A_colind(j);
214 const local_ordinal_type loc = A_colind_at_j - numLocalRows;
215 A_x_offsets_remote(i, 0, entry) = int64_t(j) * blocksize_square;
216 A_x_offsets_remote(i, 1, entry) = int64_t(loc) * blocksize;
217 } else {
218#if KOKKOS_VERSION >= 40799
219 A_x_offsets_remote(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
220#else
221 A_x_offsets_remote(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
222#endif
223#if KOKKOS_VERSION >= 40799
224 A_x_offsets_remote(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
225#else
226 A_x_offsets_remote(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
227#endif
228 }
229 }
230 }
231 });
232 } else {
233 // amd.rowptr points to both owned and nonowned entries, so it tells us how many columns (last dim) A_x_offsets should have.
234 local_ordinal_type maxEntriesPerRow = 0;
235 Kokkos::parallel_reduce(
236 Kokkos::RangePolicy<execution_space>(0, numLocalRows),
237 KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type & lmax) {
238 const local_ordinal_type lr = lclrow(i);
239 local_ordinal_type rowNum = rowptr(lr + 1) - rowptr(lr);
240 if (rowNum > lmax)
241 lmax = rowNum;
242 },
243 Kokkos::Max<local_ordinal_type>(maxEntriesPerRow));
244 amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxEntriesPerRow);
245 auto A_x_offsets = amd.A_x_offsets;
246 // Populate A,x offsets. Use ArithTraits<int64_t>::min to mark absent entries.
247 // For x offsets, add a shift blocksize*numLocalRows to represent that it indexes into x_remote instead of x.
248 Kokkos::parallel_for(
249 Kokkos::RangePolicy<execution_space>(0, numLocalRows),
250 KOKKOS_LAMBDA(local_ordinal_type i) {
251 const local_ordinal_type lr = lclrow(i);
252 const size_type A_k0 = A_block_rowptr(lr);
253 // Owned entries
254 size_type rowBegin = rowptr(lr);
255 local_ordinal_type rowOwned = rowptr(lr + 1) - rowBegin;
256 for (local_ordinal_type entry = 0; entry < maxEntriesPerRow; entry++) {
257 if (entry < rowOwned) {
258 const size_type j = A_k0 + A_colindsub(rowBegin + entry);
259 A_x_offsets(i, 0, entry) = j * blocksize_square;
260 const local_ordinal_type A_colind_at_j = A_colind(j);
261 if (A_colind_at_j < numLocalRows) {
262 const local_ordinal_type loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
263 A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
264 } else {
265 A_x_offsets(i, 1, entry) = int64_t(A_colind_at_j) * blocksize;
266 }
267 } else {
268#if KOKKOS_VERSION >= 40799
269 A_x_offsets(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
270#else
271 A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
272#endif
273#if KOKKOS_VERSION >= 40799
274 A_x_offsets(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
275#else
276 A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
277#endif
278 }
279 }
280 });
281 }
282}
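// Illustrative sketch (not part of this header): how the GPU residual kernel
// below consumes the precomputed offsets for entry k of packed row rowidx:
//
//   int64_t A_offset = A_x_offsets(rowidx, 0, k);  // offset into tpetra_values
//   int64_t x_offset = A_x_offsets(rowidx, 1, k);  // offset into x (or x_remote)
//   if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {  // min() marks an absent entry
//     // The block starts at tpetra_values.data() + A_offset. In the AsyncTag case,
//     // x_offset >= blocksize * numLocalRows means the column lives in x_remote,
//     // shifted down by that cutoff.
//   }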
283
287static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
288 const int team_size) {
289 int total_team_size(0);
290 if (blksize <= 5)
291 total_team_size = 32;
292 else if (blksize <= 9)
293 total_team_size = 32; // 64
294 else if (blksize <= 12)
295 total_team_size = 96;
296 else if (blksize <= 16)
297 total_team_size = 128;
298 else if (blksize <= 20)
299 total_team_size = 160;
300 else
301 total_team_size = 160;
302 return total_team_size / team_size;
303}
304
305static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
306 const int team_size) {
307 int total_team_size(0);
308 if (blksize <= 5)
309 total_team_size = 32;
310 else if (blksize <= 9)
311 total_team_size = 32; // 64
312 else if (blksize <= 12)
313 total_team_size = 96;
314 else if (blksize <= 16)
315 total_team_size = 128;
316 else if (blksize <= 20)
317 total_team_size = 160;
318 else
319 total_team_size = 160;
320 return total_team_size / team_size;
321}
322
323static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
324 const int team_size) {
325 int total_team_size(0);
326 if (blksize <= 5)
327 total_team_size = 32;
328 else if (blksize <= 9)
329 total_team_size = 32; // 64
330 else if (blksize <= 12)
331 total_team_size = 96;
332 else if (blksize <= 16)
333 total_team_size = 128;
334 else if (blksize <= 20)
335 total_team_size = 160;
336 else
337 total_team_size = 160;
338 return total_team_size / team_size;
339}
340
341template <typename T>
342static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
343 const int team_size) {
344 if (is_cuda<T>::value)
345 return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
346 if (is_hip<T>::value)
347 return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
348 if (is_sycl<T>::value)
349 return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
350 return -1;
351}
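// Illustrative sketch (not part of this header): run() below combines this
// recommendation with a fixed team size when building its team policy; nrows
// here is a stand-in for the league size used by the caller:
//
//   const local_ordinal_type team_size = 8;
//   const local_ordinal_type vector_size =
//       ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
//   Kokkos::TeamPolicy<execution_space, SeqTag> policy(nrows, team_size, vector_size);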
352
353template <typename MatrixType>
354struct ComputeResidualVector {
355 public:
356 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
357 using node_device_type = typename impl_type::node_device_type;
358 using execution_space = typename impl_type::execution_space;
359 using memory_space = typename impl_type::memory_space;
360
361 using local_ordinal_type = typename impl_type::local_ordinal_type;
362 using size_type = typename impl_type::size_type;
363 using impl_scalar_type = typename impl_type::impl_scalar_type;
364 using magnitude_type = typename impl_type::magnitude_type;
365 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
366 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
368 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
369 using size_type_1d_view = typename impl_type::size_type_1d_view;
370 using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
371 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
372 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
373 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
374 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
375 using i64_3d_view = typename impl_type::i64_3d_view;
376 static constexpr int vector_length = impl_type::vector_length;
377
379 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
380
381 // enum for max blocksize and vector length
382 enum : int { max_blocksize = 32 };
383
384 private:
385 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
386 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
387 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
388 Unmanaged<impl_scalar_type_2d_view_tpetra> y;
389 Unmanaged<vector_type_3d_view> y_packed;
390 Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
391
392 // AmD information
393 const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
394 const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
395 const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
396
397 // block crs graph information
398 // for cuda (kokkos crs graph uses a different size_type from size_t)
399 const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_block_rowptr;
400 const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_point_rowptr;
401 const ConstUnmanaged<Kokkos::View<local_ordinal_type *, node_device_type>> A_colind;
402
403 // blocksize
404 const local_ordinal_type blocksize_requested;
405
406 // part interface
407 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
408 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
409 const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
410 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
411 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
412 const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
413
414 // block offsets
415 const ConstUnmanaged<i64_3d_view> A_x_offsets;
416 const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
417
418 const bool is_dm2cm_active;
419 const bool hasBlockCrsMatrix;
420
421 public:
422 template <typename LocalCrsGraphType>
423 ComputeResidualVector(const AmD<MatrixType> &amd,
424 const LocalCrsGraphType &block_graph,
425 const LocalCrsGraphType &point_graph,
426 const local_ordinal_type &blocksize_requested_,
427 const PartInterface<MatrixType> &interf,
428 const local_ordinal_type_1d_view &dm2cm_,
429 bool hasBlockCrsMatrix_)
430 : rowptr(amd.rowptr)
431 , rowptr_remote(amd.rowptr_remote)
432 , colindsub(amd.A_colindsub)
433 , colindsub_remote(amd.A_colindsub_remote)
434 , tpetra_values(amd.tpetra_values)
435 , A_block_rowptr(block_graph.row_map)
436 , A_point_rowptr(point_graph.row_map)
437 , A_colind(block_graph.entries)
438 , blocksize_requested(blocksize_requested_)
439 , part2packrowidx0(interf.part2packrowidx0)
440 , part2rowidx0(interf.part2rowidx0)
441 , rowidx2part(interf.rowidx2part)
442 , partptr(interf.partptr)
443 , lclrow(interf.lclrow)
444 , dm2cm(dm2cm_)
445 , A_x_offsets(amd.A_x_offsets)
446 , A_x_offsets_remote(amd.A_x_offsets_remote)
447 , is_dm2cm_active(dm2cm_.span() > 0)
448 , hasBlockCrsMatrix(hasBlockCrsMatrix_) {}
449
450 inline void
451 SerialDot(const local_ordinal_type &blocksize,
452 const local_ordinal_type &lclRowID,
453 const local_ordinal_type &lclColID,
454 const local_ordinal_type &ii,
455 const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
456 const impl_scalar_type *const KOKKOS_RESTRICT xx,
457 /* */ impl_scalar_type *KOKKOS_RESTRICT yy) const {
458 const size_type Aj_c = colindsub_(lclColID);
459 auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
460 impl_scalar_type val = 0;
461#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
462#pragma ivdep
463#endif
464#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
465#pragma unroll
466#endif
467 for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
468 val += tpetra_values(point_row_offset + k1) * xx[k1];
469 yy[ii] -= val;
470 }
471
472 inline void
473 SerialGemv(const local_ordinal_type &blocksize,
474 const impl_scalar_type *const KOKKOS_RESTRICT AA,
475 const impl_scalar_type *const KOKKOS_RESTRICT xx,
476 /* */ impl_scalar_type *KOKKOS_RESTRICT yy) const {
477 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
478 for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
479 impl_scalar_type val = 0;
480#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
481#pragma ivdep
482#endif
483#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
484#pragma unroll
485#endif
486 for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
487 val += AA[tlb::getFlatIndex(k0, k1, blocksize)] * xx[k1];
488 yy[k0] -= val;
489 }
490 }
491
492 template <typename bbViewType, typename yyViewType>
493 KOKKOS_INLINE_FUNCTION void
494 VectorCopy(const member_type &member,
495 const local_ordinal_type &blocksize,
496 const bbViewType &bb,
497 const yyViewType &yy) const {
498 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
499 yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
500 });
501 }
502
503 template <typename xxViewType, typename yyViewType>
504 KOKKOS_INLINE_FUNCTION void
505 VectorDot(const member_type &member,
506 const local_ordinal_type &blocksize,
507 const local_ordinal_type &lclRowID,
508 const local_ordinal_type &lclColID,
509 const local_ordinal_type &ii,
510 const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
511 const xxViewType &xx,
512 const yyViewType &yy) const {
513 const size_type Aj_c = colindsub_(lclColID);
514 auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
515 impl_scalar_type val = 0;
516 Kokkos::parallel_reduce(
517 Kokkos::ThreadVectorRange(member, blocksize),
518 [&](const local_ordinal_type &k1, impl_scalar_type &update) {
519 update += tpetra_values(point_row_offset + k1) * xx(k1);
520 },
521 val);
522 Kokkos::single(Kokkos::PerThread(member),
523 [=]() {
524 Kokkos::atomic_add(&yy(ii), typename yyViewType::const_value_type(-val));
525 });
526 }
527
528 // BMK: This version coalesces accesses to AA for LayoutRight blocks.
529 template <typename AAViewType, typename xxViewType, typename yyViewType>
530 KOKKOS_INLINE_FUNCTION void
531 VectorGemv(const member_type &member,
532 const local_ordinal_type &blocksize,
533 const AAViewType &AA,
534 const xxViewType &xx,
535 const yyViewType &yy) const {
536 for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
537 impl_scalar_type val = 0;
538 Kokkos::parallel_reduce(
539 Kokkos::ThreadVectorRange(member, blocksize),
540 [&](const local_ordinal_type &k1, impl_scalar_type &update) {
541 update += AA(k0, k1) * xx(k1);
542 },
543 val);
544 Kokkos::single(Kokkos::PerThread(member),
545 [=]() {
546 Kokkos::atomic_add(&yy(k0), -val);
547 });
548 }
549 }
550
551 struct SeqTag {};
552
553 KOKKOS_INLINE_FUNCTION
554 void
555 operator()(const SeqTag &, const local_ordinal_type &i) const {
556 const local_ordinal_type blocksize = blocksize_requested;
557 const local_ordinal_type blocksize_square = blocksize * blocksize;
558
559 // constants
560 const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
561 const local_ordinal_type num_vectors = y.extent(1);
562 const local_ordinal_type row = i * blocksize;
563 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
564 // y := b
565 impl_scalar_type *yy = &y(row, col);
566 const impl_scalar_type *const bb = &b(row, col);
567 memcpy(yy, bb, sizeof(impl_scalar_type) * blocksize);
568
569 // y -= Rx
570 const size_type A_k0 = A_block_rowptr[i];
571 for (size_type k = rowptr[i]; k < rowptr[i + 1]; ++k) {
572 const size_type j = A_k0 + colindsub[k];
573 const impl_scalar_type *const xx = &x(A_colind[j] * blocksize, col);
574 if (hasBlockCrsMatrix) {
575 const impl_scalar_type *const AA = &tpetra_values(j * blocksize_square);
576 SerialGemv(blocksize, AA, xx, yy);
577 } else {
578 for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
579 SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
580 }
581 }
582 }
583 }
584
585 KOKKOS_INLINE_FUNCTION
586 void
587 operator()(const SeqTag &, const member_type &member) const {
588 // constants
589 const local_ordinal_type blocksize = blocksize_requested;
590 const local_ordinal_type blocksize_square = blocksize * blocksize;
591
592 const local_ordinal_type lr = member.league_rank();
593 const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
594 const local_ordinal_type num_vectors = y.extent(1);
595
596 // subview pattern
597 auto bb = Kokkos::subview(b, block_range, 0);
598 auto xx = bb;
599 auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);
600
601 const local_ordinal_type row = lr * blocksize;
602 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
603 // y := b
604 auto yy = Kokkos::subview(y, Kokkos::make_pair(row, row + blocksize), col);
605 bb.assign_data(&b(row, col));
606 if (member.team_rank() == 0)
607 VectorCopy(member, blocksize, bb, yy);
608 member.team_barrier();
609
610 // y -= Rx
611 const size_type A_k0 = A_block_rowptr[lr];
612
613 if (hasBlockCrsMatrix) {
614 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
615 [&](const local_ordinal_type &k) {
616 const size_type j = A_k0 + colindsub[k];
617 xx.assign_data(&x(A_colind[j] * blocksize, col));
618 A_block_cst.assign_data(&tpetra_values(j * blocksize_square));
619 VectorGemv(member, blocksize, A_block_cst, xx, yy);
620 });
621 } else {
622 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
623 [&](const local_ordinal_type &k) {
624 const size_type j = A_k0 + colindsub[k];
625 xx.assign_data(&x(A_colind[j] * blocksize, col));
626
627 for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
628 VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
629 });
630 }
631 }
632 }
633
634 // * B: block size for compile-time specialization, or 0 for general case (up to max_blocksize)
635 // * async: true if using async importer. overlap is not used in this case.
636 // Whether a column is owned or nonowned is decided at runtime.
637 // * overlap: true if processing the columns that are not locally owned,
638 // false if processing locally owned columns.
639 // * haveBlockMatrix: true if A is a BlockCrsMatrix, false if it's CrsMatrix.
640 template <int B, bool async, bool overlap, bool haveBlockMatrix>
641 struct GeneralTag {
642 static_assert(!(async && overlap),
643 "ComputeResidualVector: async && overlap is not a valid configuration for GeneralTag");
644 };
645
646 // Define AsyncTag and OverlapTag in terms of GeneralTag:
647 // P == 0 means only compute on owned columns
648 // P == 1 means only compute on nonowned columns
649 template <int P, int B, bool haveBlockMatrix>
650 using OverlapTag = GeneralTag<B, false, P != 0, haveBlockMatrix>;
651
652 template <int B, bool haveBlockMatrix>
653 using AsyncTag = GeneralTag<B, true, false, haveBlockMatrix>;
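// For example, AsyncTag<3, true> is GeneralTag<3, true, false, true>
// (compile-time block size 3, async importer, BlockCrsMatrix), and
// OverlapTag<1, 0, false> is GeneralTag<0, false, true, false>
// (runtime block size, nonowned columns only, point CrsMatrix).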
654
655 // CPU implementation for all cases
656 template <int B, bool async, bool overlap, bool haveBlockMatrix>
657 KOKKOS_INLINE_FUNCTION void
658 operator()(const GeneralTag<B, async, overlap, haveBlockMatrix> &, const local_ordinal_type &rowidx) const {
659 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
660
661 // constants
662 const local_ordinal_type partidx = rowidx2part(rowidx);
663 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
664 const local_ordinal_type v = partidx % vector_length;
665
666 const local_ordinal_type num_vectors = y_packed.extent(2);
667 const local_ordinal_type num_local_rows = lclrow.extent(0);
668
669 // temporary buffer for y flat
670 impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
671
672 const local_ordinal_type lr = lclrow(rowidx);
673
674 auto colindsub_used = overlap ? colindsub_remote : colindsub;
675 auto rowptr_used = overlap ? rowptr_remote : rowptr;
676
677 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
678 if constexpr (overlap) {
679 // y (temporary) := 0
680 memset((void *)yy, 0, sizeof(impl_scalar_type) * blocksize);
681 } else {
682 // y := b
683 const local_ordinal_type row = lr * blocksize;
684 memcpy(yy, &b(row, col), sizeof(impl_scalar_type) * blocksize);
685 }
686
687 // y -= Rx
688 const size_type A_k0 = A_block_rowptr[lr];
689 for (size_type k = rowptr_used[lr]; k < rowptr_used[lr + 1]; ++k) {
690 const size_type j = A_k0 + colindsub_used[k];
691 const local_ordinal_type A_colind_at_j = A_colind[j];
692 if constexpr (haveBlockMatrix) {
693 const local_ordinal_type blocksize_square = blocksize * blocksize;
694 const impl_scalar_type *const AA = &tpetra_values(j * blocksize_square);
695 if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
696 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
697 const impl_scalar_type *const xx = &x(loc * blocksize, col);
698 SerialGemv(blocksize, AA, xx, yy);
699 } else {
700 const auto loc = A_colind_at_j - num_local_rows;
701 const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
702 SerialGemv(blocksize, AA, xx_remote, yy);
703 }
704 } else {
705 if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
706 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
707 const impl_scalar_type *const xx = &x(loc * blocksize, col);
708 for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
709 SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
710 } else {
711 const auto loc = A_colind_at_j - num_local_rows;
712 const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
713 for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
714 SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
715 }
716 }
717 }
718 // move yy to y_packed
719 if constexpr (overlap) {
720 for (local_ordinal_type k = 0; k < blocksize; ++k)
721 y_packed(pri, k, col)[v] += yy[k];
722 } else {
723 for (local_ordinal_type k = 0; k < blocksize; ++k)
724 y_packed(pri, k, col)[v] = yy[k];
725 }
726 }
727 }
728
729 // GPU implementation for hasBlockCrsMatrix == true
730 template <int B, bool async, bool overlap>
731 KOKKOS_INLINE_FUNCTION void
732 operator()(const GeneralTag<B, async, overlap, true> &, const member_type &member) const {
733 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
734
735 // constants
736 const local_ordinal_type rowidx = member.league_rank();
737 const local_ordinal_type partidx = rowidx2part(rowidx);
738 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
739 const local_ordinal_type v = partidx % vector_length;
740
741 const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
742 const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
743 const local_ordinal_type num_local_rows = lclrow.extent(0);
744
745 // subview pattern
746 auto bb = Kokkos::subview(b, block_range, 0);
747 auto xx = bb;
748 auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
749 auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);
750
 751 // Get scratch allocations: local_Ax in team scratch, local_x in thread scratch
752 impl_scalar_type *local_Ax = reinterpret_cast<impl_scalar_type *>(member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
753 impl_scalar_type *local_x = reinterpret_cast<impl_scalar_type *>(member.thread_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
754
755 const local_ordinal_type lr = lclrow(rowidx);
756 const local_ordinal_type row = lr * blocksize;
757 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
758 if (col)
759 member.team_barrier();
760 // y -= Rx
761 // Initialize accumulation array
762 Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
763 local_Ax[i] = 0;
764 });
765 member.team_barrier();
766
767 int numEntries;
768 if constexpr (!overlap) {
769 numEntries = A_x_offsets.extent(2);
770 } else {
771 numEntries = A_x_offsets_remote.extent(2);
772 }
773
774 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numEntries),
775 [&](const int k) {
776 int64_t A_offset = overlap ? A_x_offsets_remote(rowidx, 0, k) : A_x_offsets(rowidx, 0, k);
777 int64_t x_offset = overlap ? A_x_offsets_remote(rowidx, 1, k) : A_x_offsets(rowidx, 1, k);
778#if KOKKOS_VERSION >= 40799
779 if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
780#else
781 if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
782#endif
783 A_block_cst.assign_data(tpetra_values.data() + A_offset);
784 // Pull x into local memory
785 if constexpr (async) {
786 size_type remote_cutoff = blocksize * num_local_rows;
787 if (x_offset >= remote_cutoff)
788 xx.assign_data(&x_remote(x_offset - remote_cutoff, col));
789 else
790 xx.assign_data(&x(x_offset, col));
791 } else {
792 if constexpr (!overlap) {
793 xx.assign_data(&x(x_offset, col));
794 } else {
795 xx.assign_data(&x_remote(x_offset, col));
796 }
797 }
798
799 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
800 local_x[i] = xx(i);
801 });
802
803 // MatVec op Ax += A*x
804 Kokkos::parallel_for(
805 Kokkos::ThreadVectorRange(member, blocksize),
806 [&](const local_ordinal_type &k0) {
807 impl_scalar_type val = 0;
808 for (int k1 = 0; k1 < blocksize; k1++)
809 val += A_block_cst(k0, k1) * local_x[k1];
810 Kokkos::atomic_add(local_Ax + k0, val);
811 });
812 }
813 });
814 member.team_barrier();
815 // Update y = b - local_Ax
816 yy.assign_data(&y_packed_scalar(pri, 0, col, v));
817 bb.assign_data(&b(row, col));
818 Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
819 if (!overlap)
820 yy(i) = bb(i) - local_Ax[i];
821 else
822 yy(i) -= local_Ax[i];
823 });
824 }
825 }
826
827 // GPU implementation for hasBlockCrsMatrix == false
828 template <int B, bool async, bool overlap>
829 KOKKOS_INLINE_FUNCTION void
830 operator()(const GeneralTag<B, async, overlap, false> &, const member_type &member) const {
831 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
832
833 // constants
834 const local_ordinal_type rowidx = member.league_rank();
835 const local_ordinal_type partidx = rowidx2part(rowidx);
836 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
837 const local_ordinal_type v = partidx % vector_length;
838
839 const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
840 const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
841 const local_ordinal_type num_local_rows = lclrow.extent(0);
842
843 // subview pattern
844 auto bb = Kokkos::subview(b, block_range, 0);
845 auto xx = bb;
846 auto xx_remote = bb;
847 auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
848 auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);
849 auto colindsub_used = overlap ? colindsub_remote : colindsub;
850 auto rowptr_used = overlap ? rowptr_remote : rowptr;
851
852 const local_ordinal_type lr = lclrow(rowidx);
853 const local_ordinal_type row = lr * blocksize;
854 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
855 yy.assign_data(&y_packed_scalar(pri, 0, col, v));
856 if (!overlap) {
857 // y := b
858 bb.assign_data(&b(row, col));
859 if (member.team_rank() == 0)
860 VectorCopy(member, blocksize, bb, yy);
861 member.team_barrier();
862 }
863
864 // y -= Rx
865 const size_type A_k0 = A_block_rowptr[lr];
866 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr + 1]),
867 [&](const local_ordinal_type &k) {
868 const size_type j = A_k0 + colindsub_used[k];
869 const local_ordinal_type A_colind_at_j = A_colind[j];
870 if ((async && A_colind_at_j < num_local_rows) || (!async && !overlap)) {
871 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
872 xx.assign_data(&x(loc * blocksize, col));
873 for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
874 VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
875 } else {
876 const auto loc = A_colind_at_j - num_local_rows;
877 xx_remote.assign_data(&x_remote(loc * blocksize, col));
878 for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
879 VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
880 }
881 });
882 }
883 }
884
885 // y = b - Rx; seq method
886 template <typename MultiVectorLocalViewTypeY,
887 typename MultiVectorLocalViewTypeB,
888 typename MultiVectorLocalViewTypeX>
889 void run(const MultiVectorLocalViewTypeY &y_,
890 const MultiVectorLocalViewTypeB &b_,
891 const MultiVectorLocalViewTypeX &x_) {
892 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
893 IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<SeqTag>", ComputeResidual0, execution_space);
894
895 y = y_;
896 b = b_;
897 x = x_;
898 if constexpr (is_device<execution_space>::value) {
899 const local_ordinal_type blocksize = blocksize_requested;
900 const local_ordinal_type team_size = 8;
901 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
902 const Kokkos::TeamPolicy<execution_space, SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
903 Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
904 } else {
905 const Kokkos::RangePolicy<execution_space, SeqTag> policy(0, rowptr.extent(0) - 1);
906 Kokkos::parallel_for("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
907 }
908 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
909 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
910 }
911
912 // y = b - R (x , x_remote)
913 template <typename MultiVectorLocalViewTypeB,
914 typename MultiVectorLocalViewTypeX,
915 typename MultiVectorLocalViewTypeX_Remote>
916 void run(const vector_type_3d_view &y_packed_,
917 const MultiVectorLocalViewTypeB &b_,
918 const MultiVectorLocalViewTypeX &x_,
919 const MultiVectorLocalViewTypeX_Remote &x_remote_) {
920 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
921 IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<AsyncTag>", ComputeResidual0, execution_space);
922
923 b = b_;
924 x = x_;
925 x_remote = x_remote_;
926 if constexpr (is_device<execution_space>::value) {
927 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
928 y_packed_.extent(0),
929 y_packed_.extent(1),
930 y_packed_.extent(2),
931 vector_length);
932 } else {
933 y_packed = y_packed_;
934 }
935
936 if constexpr (is_device<execution_space>::value) {
937 const local_ordinal_type blocksize = blocksize_requested;
938 // local_ordinal_type vl_power_of_two = 1;
939 // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
940 // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
941 // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
942#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
943 { \
944 if (this->hasBlockCrsMatrix) { \
945 const local_ordinal_type team_size = 8; \
946 const local_ordinal_type vector_size = 8; \
947 const size_t shmem_team_size = blocksize * sizeof(btdm_scalar_type); \
948 const size_t shmem_thread_size = blocksize * sizeof(btdm_scalar_type); \
949 Kokkos::TeamPolicy<execution_space, AsyncTag<B, true>> \
950 policy(rowidx2part.extent(0), team_size, vector_size); \
951 policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
952 Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>", \
953 policy, *this); \
954 } else { \
955 const local_ordinal_type team_size = 8; \
956 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
957 const Kokkos::TeamPolicy<execution_space, AsyncTag<B, false>> \
958 policy(rowidx2part.extent(0), team_size, vector_size); \
959 Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>", \
960 policy, *this); \
961 } \
962 } \
963 break
964 switch (blocksize_requested) {
965 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
966 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
967 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
968 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
969 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
970 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
971 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
972 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
973 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
974 default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
975 }
976#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
977 } else {
978#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
979 { \
980 if (this->hasBlockCrsMatrix) { \
981 const Kokkos::RangePolicy<execution_space, AsyncTag<B, true>> policy(0, rowidx2part.extent(0)); \
982 Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>", \
983 policy, *this); \
984 } else { \
985 const Kokkos::RangePolicy<execution_space, AsyncTag<B, false>> policy(0, rowidx2part.extent(0)); \
986 Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>", \
987 policy, *this); \
988 } \
989 } \
990 break
991
992 switch (blocksize_requested) {
993 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
994 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
995 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
996 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
997 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
998 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
999 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
1000 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
1001 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
1002 default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
1003 }
1004#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
1005 }
1006 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
1007 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
1008 }
1009
 1010 // y = b - R (x , x_remote); owned and nonowned columns handled in separate passes
1011 template <typename MultiVectorLocalViewTypeB,
1012 typename MultiVectorLocalViewTypeX,
1013 typename MultiVectorLocalViewTypeX_Remote>
1014 void run(const vector_type_3d_view &y_packed_,
1015 const MultiVectorLocalViewTypeB &b_,
1016 const MultiVectorLocalViewTypeX &x_,
1017 const MultiVectorLocalViewTypeX_Remote &x_remote_,
1018 const bool compute_owned) {
1019 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
1020 IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<OverlapTag>", ComputeResidual0, execution_space);
1021
1022 b = b_;
1023 x = x_;
1024 x_remote = x_remote_;
1025 if constexpr (is_device<execution_space>::value) {
1026 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
1027 y_packed_.extent(0),
1028 y_packed_.extent(1),
1029 y_packed_.extent(2),
1030 vector_length);
1031 } else {
1032 y_packed = y_packed_;
1033 }
1034
1035 if constexpr (is_device<execution_space>::value) {
1036 const local_ordinal_type blocksize = blocksize_requested;
1037 // local_ordinal_type vl_power_of_two = 1;
1038 // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
1039 // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
1040 // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
1041#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
1042 if (this->hasBlockCrsMatrix) { \
1043 const local_ordinal_type team_size = 8; \
1044 const local_ordinal_type vector_size = 8; \
1045 const size_t shmem_team_size = blocksize * sizeof(btdm_scalar_type); \
1046 const size_t shmem_thread_size = blocksize * sizeof(btdm_scalar_type); \
1047 if (compute_owned) { \
1048 Kokkos::TeamPolicy<execution_space, OverlapTag<0, B, true>> \
1049 policy(rowidx2part.extent(0), team_size, vector_size); \
1050 policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
1051 Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
1052 } else { \
1053 Kokkos::TeamPolicy<execution_space, OverlapTag<1, B, true>> \
1054 policy(rowidx2part.extent(0), team_size, vector_size); \
1055 policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
1056 Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
1057 } \
1058 } else { \
1059 const local_ordinal_type team_size = 8; \
1060 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
1061 if (compute_owned) { \
1062 const Kokkos::TeamPolicy<execution_space, OverlapTag<0, B, false>> \
1063 policy(rowidx2part.extent(0), team_size, vector_size); \
1064 Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
1065 } else { \
1066 const Kokkos::TeamPolicy<execution_space, OverlapTag<1, B, false>> \
1067 policy(rowidx2part.extent(0), team_size, vector_size); \
1068 Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
1069 } \
1070 } \
1071 break
1072 switch (blocksize_requested) {
1073 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
1074 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
1075 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
1076 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
1077 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
1078 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
1079 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
1080 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
1081 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
1082 default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
1083 }
1084#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
1085 } else {
1086#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
1087 if (this->hasBlockCrsMatrix) { \
1088 if (compute_owned) { \
1089 const Kokkos::RangePolicy<execution_space, OverlapTag<0, B, true>> \
1090 policy(0, rowidx2part.extent(0)); \
1091 Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
1092 } else { \
1093 const Kokkos::RangePolicy<execution_space, OverlapTag<1, B, true>> \
1094 policy(0, rowidx2part.extent(0)); \
1095 Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
1096 } \
1097 } else { \
1098 if (compute_owned) { \
1099 const Kokkos::RangePolicy<execution_space, OverlapTag<0, B, false>> \
1100 policy(0, rowidx2part.extent(0)); \
1101 Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
1102 } else { \
1103 const Kokkos::RangePolicy<execution_space, OverlapTag<1, B, false>> \
1104 policy(0, rowidx2part.extent(0)); \
1105 Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
1106 } \
1107 } \
1108 break
1109
1110 switch (blocksize_requested) {
1111 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
1112 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
1113 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
1114 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
1115 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
1116 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
1117 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
1118 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
1119 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
1120 default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
1121 }
1122#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
1123 }
1124 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
1125 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
1126 }
1127};
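// Illustrative usage sketch (not part of this header); the local views named
// below are assumptions standing in for the caller's data:
//
//   ComputeResidualVector<MatrixType> cr(amd, block_graph, point_graph,
//                                        blocksize, interf, dm2cm,
//                                        /*hasBlockCrsMatrix=*/true);
//   cr.run(y_local, b_local, x_local);                     // seq method: y = b - R x
//   cr.run(y_packed, b_local, x_local, x_remote);          // AsyncTag path
//   cr.run(y_packed, b_local, x_local, x_remote, owned);   // OverlapTag path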
1128
1129} // namespace BlockHelperDetails
1130
1131} // namespace Ifpack2
1132
1133#endif