// Ifpack2 Templated Preconditioning Package Version 1.0
// Ifpack2_BlockComputeResidualVector_def.hpp
1// @HEADER
2// *****************************************************************************
3// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4//
5// Copyright 2009 NTESS and the Ifpack2 contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef IFPACK2_BLOCKCOMPUTERES_VECTOR_DEF_HPP
11#define IFPACK2_BLOCKCOMPUTERES_VECTOR_DEF_HPP
12
13#include "Ifpack2_BlockComputeResidualVector_decl.hpp"
14
15namespace Ifpack2::BlockHelperDetails {
16
17// Precompute offsets of each A and x entry to speed up residual.
18// (Applies for hasBlockCrsMatrix == true and OverlapTag/AsyncTag)
19// Reading A, x take up to 4, 6 levels of indirection respectively,
20// but precomputing the offsets reduces it to 2 for both.
21//
22// This function allocates and populates these members of AmD:
23// A_x_offsets, A_x_offsets_remote
24template <typename MatrixType>
25void ComputeResidualVector<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::precompute_A_x_offsets(
26 AmD<MatrixType> &amd,
27 const PartInterface<MatrixType> &interf,
28 const Teuchos::RCP<const typename impl_type::tpetra_crs_graph_type> &g,
29 const local_ordinal_type_1d_view &dm2cm,
30 int blocksize,
31 bool ownedRemoteSeparate) {
32 using impl_type = ImplType<MatrixType>;
33 using i64_3d_view = typename impl_type::i64_3d_view;
34 using size_type = typename impl_type::size_type;
35 using local_ordinal_type = typename impl_type::local_ordinal_type;
36 using execution_space = typename impl_type::execution_space;
37 auto local_graph = g->getLocalGraphDevice();
38 const auto A_block_rowptr = local_graph.row_map;
39 const auto A_colind = local_graph.entries;
40 local_ordinal_type numLocalRows = interf.rowidx2part.extent(0);
41 int blocksize_square = blocksize * blocksize;
42 // shallow-copying views to avoid capturing the amd, interf objects in lambdas
43 auto lclrow = interf.lclrow;
44 auto A_colindsub = amd.A_colindsub;
45 auto A_colindsub_remote = amd.A_colindsub_remote;
46 auto rowptr = amd.rowptr;
47 auto rowptr_remote = amd.rowptr_remote;
48 bool is_dm2cm_active = dm2cm.extent(0);
49 if (ownedRemoteSeparate) {
50 // amd.rowptr points to owned entries only, and amd.rowptr_remote points to nonowned.
51 local_ordinal_type maxOwnedEntriesPerRow = 0;
52 local_ordinal_type maxNonownedEntriesPerRow = 0;
53 Kokkos::parallel_reduce(
54 Kokkos::RangePolicy<execution_space>(0, numLocalRows),
55 KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type & lmaxOwned, local_ordinal_type & lmaxNonowned) {
56 const local_ordinal_type lr = lclrow(i);
57 local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowptr(lr);
58 if (rowNumOwned > lmaxOwned)
59 lmaxOwned = rowNumOwned;
60 // rowptr_remote won't be allocated for single-rank problems
61 if (rowptr_remote.extent(0)) {
62 local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowptr_remote(lr);
63 if (rowNumNonowned > lmaxNonowned)
64 lmaxNonowned = rowNumNonowned;
65 } else {
66 lmaxNonowned = 0;
67 }
68 },
69 Kokkos::Max<local_ordinal_type>(maxOwnedEntriesPerRow), Kokkos::Max<local_ordinal_type>(maxNonownedEntriesPerRow));
70 // Allocate the two offsets views now that we know the dimensions
71 // For each one, the middle dimension is 0 for A offsets and 1 for x offsets.
72 // Packing them together in one view improves cache line utilization
73 amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxOwnedEntriesPerRow);
74 amd.A_x_offsets_remote = i64_3d_view("amd.A_x_offsets_remote", numLocalRows, 2, maxNonownedEntriesPerRow);
75 auto A_x_offsets = amd.A_x_offsets;
76 auto A_x_offsets_remote = amd.A_x_offsets_remote;
77 // Now, populate all the offsets. Use ArithTraits<int64_t>::min to mark absent entries.
78 Kokkos::parallel_for(
79 Kokkos::RangePolicy<execution_space>(0, numLocalRows),
80 KOKKOS_LAMBDA(local_ordinal_type i) {
81 const local_ordinal_type lr = lclrow(i);
82 const size_type A_k0 = A_block_rowptr(lr);
83 // Owned entries
84 size_type rowBegin = rowptr(lr);
85 local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowBegin;
86 for (local_ordinal_type entry = 0; entry < maxOwnedEntriesPerRow; entry++) {
87 if (entry < rowNumOwned) {
88 const size_type j = A_k0 + A_colindsub(rowBegin + entry);
89 const local_ordinal_type A_colind_at_j = A_colind(j);
90 const local_ordinal_type loc = is_dm2cm_active ? dm2cm(A_colind_at_j) : A_colind_at_j;
91 A_x_offsets(i, 0, entry) = int64_t(j) * blocksize_square;
92 A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
93 } else {
94 A_x_offsets(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
95 A_x_offsets(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
96 }
97 }
98 // Nonowned entries
99 if (rowptr_remote.extent(0)) {
100 rowBegin = rowptr_remote(lr);
101 local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowBegin;
102 for (local_ordinal_type entry = 0; entry < maxNonownedEntriesPerRow; entry++) {
103 if (entry < rowNumNonowned) {
104 const size_type j = A_k0 + A_colindsub_remote(rowBegin + entry);
105 const local_ordinal_type A_colind_at_j = A_colind(j);
106 const local_ordinal_type loc = A_colind_at_j - numLocalRows;
107 A_x_offsets_remote(i, 0, entry) = int64_t(j) * blocksize_square;
108 A_x_offsets_remote(i, 1, entry) = int64_t(loc) * blocksize;
109 } else {
110 A_x_offsets_remote(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
111 A_x_offsets_remote(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
112 }
113 }
114 }
115 });
116 } else {
117 // amd.rowptr points to both owned and nonowned entries, so it tells us how many columns (last dim) A_x_offsets should have.
118 local_ordinal_type maxEntriesPerRow = 0;
119 Kokkos::parallel_reduce(
120 Kokkos::RangePolicy<execution_space>(0, numLocalRows),
121 KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type & lmax) {
122 const local_ordinal_type lr = lclrow(i);
123 local_ordinal_type rowNum = rowptr(lr + 1) - rowptr(lr);
124 if (rowNum > lmax)
125 lmax = rowNum;
126 },
127 Kokkos::Max<local_ordinal_type>(maxEntriesPerRow));
128 amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxEntriesPerRow);
129 auto A_x_offsets = amd.A_x_offsets;
130 // Populate A,x offsets. Use ArithTraits<int64_t>::min to mark absent entries.
131 // For x offsets, add a shift blocksize*numLocalRows to represent that it indexes into x_remote instead of x.
132 Kokkos::parallel_for(
133 Kokkos::RangePolicy<execution_space>(0, numLocalRows),
134 KOKKOS_LAMBDA(local_ordinal_type i) {
135 const local_ordinal_type lr = lclrow(i);
136 const size_type A_k0 = A_block_rowptr(lr);
137 // Owned entries
138 size_type rowBegin = rowptr(lr);
139 local_ordinal_type rowOwned = rowptr(lr + 1) - rowBegin;
140 for (local_ordinal_type entry = 0; entry < maxEntriesPerRow; entry++) {
141 if (entry < rowOwned) {
142 const size_type j = A_k0 + A_colindsub(rowBegin + entry);
143 A_x_offsets(i, 0, entry) = j * blocksize_square;
144 const local_ordinal_type A_colind_at_j = A_colind(j);
145 if (A_colind_at_j < numLocalRows) {
146 const local_ordinal_type loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
147 A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
148 } else {
149 A_x_offsets(i, 1, entry) = int64_t(A_colind_at_j) * blocksize;
150 }
151 } else {
152 A_x_offsets(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
153 A_x_offsets(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
154 }
155 }
156 });
157 }
158}
159
// Heuristic: pick a total team size for the CUDA backend based on the block
// size, then derive the vector length by dividing out the requested team size.
static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
                                                                 const int team_size) {
  int recommended_total = 160;  // default for large blocks (> 16)
  if (blksize <= 9)
    recommended_total = 32;  // 64
  else if (blksize <= 12)
    recommended_total = 96;
  else if (blksize <= 16)
    recommended_total = 128;
  return recommended_total / team_size;
}
180
// Heuristic: pick a total team size for the HIP backend based on the block
// size, then derive the vector length by dividing out the requested team size.
static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
                                                                const int team_size) {
  int recommended_total = 160;  // default for large blocks (> 16)
  if (blksize <= 9)
    recommended_total = 32;  // 64
  else if (blksize <= 12)
    recommended_total = 96;
  else if (blksize <= 16)
    recommended_total = 128;
  return recommended_total / team_size;
}
198
// Heuristic: pick a total team size for the SYCL backend based on the block
// size, then derive the vector length by dividing out the requested team size.
static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
                                                                 const int team_size) {
  int recommended_total = 160;  // default for large blocks (> 16)
  if (blksize <= 9)
    recommended_total = 32;  // 64
  else if (blksize <= 12)
    recommended_total = 96;
  else if (blksize <= 16)
    recommended_total = 128;
  return recommended_total / team_size;
}
216
217template <typename T>
218static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
219 const int team_size) {
220 if (is_cuda<T>::value)
221 return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
222 if (is_hip<T>::value)
223 return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
224 if (is_sycl<T>::value)
225 return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
226 return -1;
227}
228
// Functor computing the block residual y = b - R*(x, x_remote), where R is the
// portion of A selected by the AmD rowptr/colindsub maps. One functor type
// serves several variants, dispatched by tag type in operator():
//   - SeqTag: owned columns only, writing into the flat multivector y.
//   - GeneralTag<B, async, overlap, haveBlockMatrix>: async/overlap importer
//     schemes, writing into the packed SIMD view y_packed (CPU) or
//     y_packed_scalar (GPU).
// Each tag has a RangePolicy overload (CPU) and a TeamPolicy overload (GPU).
template <typename MatrixType>
struct ComputeResidualFunctor {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type;  // block crs (layout right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector (layout left)
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;
  static constexpr int vector_length = impl_type::vector_length;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // Input/output multivectors; the run() overloads assign these before dispatch.
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
  Unmanaged<vector_type_3d_view> y_packed;
  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;

  // AmD information
  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // block crs graph information
  // for cuda (kokkos crs graph uses a different size_type from size_t)
  const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_block_rowptr;
  const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_point_rowptr;
  const ConstUnmanaged<Kokkos::View<local_ordinal_type *, node_device_type>> A_colind;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;

  // block offsets (precomputed flat offsets into tpetra_values and x/x_remote;
  // see precompute_A_x_offsets)
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  const bool is_dm2cm_active;
  const bool hasBlockCrsMatrix;

  // Shallow-copies all views/flags from the owning ComputeResidualVector so
  // the functor can be captured by value in Kokkos kernels.
  ComputeResidualFunctor(const ComputeResidualVector<MatrixType> &crv)
      : rowptr(crv.rowptr)
      , rowptr_remote(crv.rowptr_remote)
      , colindsub(crv.colindsub)
      , colindsub_remote(crv.colindsub_remote)
      , tpetra_values(crv.tpetra_values)
      , A_block_rowptr(crv.A_block_rowptr)
      , A_point_rowptr(crv.A_point_rowptr)
      , A_colind(crv.A_colind)
      , blocksize_requested(crv.blocksize_requested)
      , part2packrowidx0(crv.part2packrowidx0)
      , part2rowidx0(crv.part2rowidx0)
      , rowidx2part(crv.rowidx2part)
      , partptr(crv.partptr)
      , lclrow(crv.lclrow)
      , dm2cm(crv.dm2cm)
      , A_x_offsets(crv.A_x_offsets)
      , A_x_offsets_remote(crv.A_x_offsets_remote)
      , is_dm2cm_active(crv.is_dm2cm_active)
      , hasBlockCrsMatrix(crv.hasBlockCrsMatrix) {}

  // Point-CRS path (hasBlockCrsMatrix == false): subtract from yy[ii] the dot
  // product of row ii of the (lclRowID, colindsub_(lclColID)) block with xx.
  // The row is addressed through A_point_rowptr, i.e. the point-entry CRS of A.
  inline void
  SerialDot(const local_ordinal_type &blocksize,
            const local_ordinal_type &lclRowID,
            const local_ordinal_type &lclColID,
            const local_ordinal_type &ii,
            const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
            const impl_scalar_type *const KOKKOS_RESTRICT xx,
            /* */ impl_scalar_type *KOKKOS_RESTRICT yy) const {
    const size_type Aj_c = colindsub_(lclColID);
    auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
    impl_scalar_type val = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#pragma unroll
#endif
    for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
      val += tpetra_values(point_row_offset + k1) * xx[k1];
    yy[ii] -= val;
  }

  // Block-CRS path: yy -= AA * xx for one blocksize x blocksize little block.
  // AA is indexed through TpetraLittleBlock's flat index so the loop matches
  // Tpetra's block storage layout.
  inline void
  SerialGemv(const local_ordinal_type &blocksize,
             const impl_scalar_type *const KOKKOS_RESTRICT AA,
             const impl_scalar_type *const KOKKOS_RESTRICT xx,
             /* */ impl_scalar_type *KOKKOS_RESTRICT yy) const {
    using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
    for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
      impl_scalar_type val = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#pragma unroll
#endif
      for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
        val += AA[tlb::getFlatIndex(k0, k1, blocksize)] * xx[k1];
      yy[k0] -= val;
    }
  }

  // Copy one block: yy(k) := bb(k) for k in [0, blocksize), parallelized over
  // the calling thread's vector lanes.
  template <typename bbViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION void
  VectorCopy(const member_type &member,
             const local_ordinal_type &blocksize,
             const bbViewType &bb,
             const yyViewType &yy) const {
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
      yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
    });
  }

  // Team-parallel analogue of SerialDot: the dot product is reduced across the
  // thread's vector lanes, then subtracted from yy(ii) with an atomic since
  // other threads in the team may be updating yy concurrently.
  template <typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION void
  VectorDot(const member_type &member,
            const local_ordinal_type &blocksize,
            const local_ordinal_type &lclRowID,
            const local_ordinal_type &lclColID,
            const local_ordinal_type &ii,
            const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
            const xxViewType &xx,
            const yyViewType &yy) const {
    const size_type Aj_c = colindsub_(lclColID);
    auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
    impl_scalar_type val = 0;
    Kokkos::parallel_reduce(
        Kokkos::ThreadVectorRange(member, blocksize),
        [&](const local_ordinal_type &k1, impl_scalar_type &update) {
          update += tpetra_values(point_row_offset + k1) * xx(k1);
        },
        val);
    Kokkos::single(Kokkos::PerThread(member),
                   [=]() {
                     Kokkos::atomic_add(&yy(ii), typename yyViewType::const_value_type(-val));
                   });
  }

  // BMK: This version coalesces accesses to AA for LayoutRight blocks.
  // Team-parallel yy -= AA * xx: each row's dot product is reduced over vector
  // lanes; the subtraction is atomic for the same reason as in VectorDot.
  template <typename AAViewType, typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION void
  VectorGemv(const member_type &member,
             const local_ordinal_type &blocksize,
             const AAViewType &AA,
             const xxViewType &xx,
             const yyViewType &yy) const {
    for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
      impl_scalar_type val = 0;
      Kokkos::parallel_reduce(
          Kokkos::ThreadVectorRange(member, blocksize),
          [&](const local_ordinal_type &k1, impl_scalar_type &update) {
            update += AA(k0, k1) * xx(k1);
          },
          val);
      Kokkos::single(Kokkos::PerThread(member),
                     [=]() {
                       Kokkos::atomic_add(&yy(k0), -val);
                     });
    }
  }

  // Tag for the simple owned-columns-only residual, writing into the flat y.
  struct SeqTag {};

  // CPU (RangePolicy) SeqTag: i is the local block row; compute y = b - R x
  // for that block row across all multivector columns.
  void
  operator()(const SeqTag &, const local_ordinal_type &i) const {
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize * blocksize;

    // constants
    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);
    const local_ordinal_type row = i * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // y := b
      impl_scalar_type *yy = &y(row, col);
      const impl_scalar_type *const bb = &b(row, col);
      memcpy(yy, bb, sizeof(impl_scalar_type) * blocksize);

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[i];
      for (size_type k = rowptr[i]; k < rowptr[i + 1]; ++k) {
        const size_type j = A_k0 + colindsub[k];
        const impl_scalar_type *const xx = &x(A_colind[j] * blocksize, col);
        if (hasBlockCrsMatrix) {
          const impl_scalar_type *const AA = &tpetra_values(j * blocksize_square);
          SerialGemv(blocksize, AA, xx, yy);
        } else {
          for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
            SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
        }
      }
    }
  }

  // GPU (TeamPolicy) SeqTag: one team per local block row; matrix entries of
  // the row are distributed across team threads, block rows/columns across
  // vector lanes.
  KOKKOS_INLINE_FUNCTION
  void
  operator()(const SeqTag &, const member_type &member) const {
    // constants
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize * blocksize;

    const local_ordinal_type lr = member.league_rank();
    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);

    // subview pattern: create the subviews once, then re-point them with
    // assign_data inside the loops to avoid repeated subview construction
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = bb;
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);

    const local_ordinal_type row = lr * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // y := b
      auto yy = Kokkos::subview(y, Kokkos::make_pair(row, row + blocksize), col);
      bb.assign_data(&b(row, col));
      if (member.team_rank() == 0)
        VectorCopy(member, blocksize, bb, yy);
      member.team_barrier();

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[lr];

      if (hasBlockCrsMatrix) {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
                             [&](const local_ordinal_type &k) {
                               const size_type j = A_k0 + colindsub[k];
                               xx.assign_data(&x(A_colind[j] * blocksize, col));
                               A_block_cst.assign_data(&tpetra_values(j * blocksize_square));
                               VectorGemv(member, blocksize, A_block_cst, xx, yy);
                             });
      } else {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
                             [&](const local_ordinal_type &k) {
                               const size_type j = A_k0 + colindsub[k];
                               xx.assign_data(&x(A_colind[j] * blocksize, col));

                               for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                                 VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
                             });
      }
    }
  }

  // * B: block size for compile-time specialization, or 0 for general case (up to max_blocksize)
  // * async: true if using async importer. overlap is not used in this case.
  //   Whether a column is owned or nonowned is decided at runtime.
  // * overlap: true if processing the columns that are not locally owned,
  //   false if processing locally owned columns.
  // * haveBlockMatrix: true if A is a BlockCrsMatrix, false if it's CrsMatrix.
  template <int B, bool async, bool overlap, bool haveBlockMatrix>
  struct GeneralTag {
    static_assert(!(async && overlap),
                  "ComputeResidualVector: async && overlap is not a valid configuration for GeneralTag");
  };

  // Define AsyncTag and OverlapTag in terms of GeneralTag:
  // P == 0 means only compute on owned columns
  // P == 1 means only compute on nonowned columns
  template <int P, int B, bool haveBlockMatrix>
  using OverlapTag = GeneralTag<B, false, P != 0, haveBlockMatrix>;

  template <int B, bool haveBlockMatrix>
  using AsyncTag = GeneralTag<B, true, false, haveBlockMatrix>;

  // CPU implementation for all cases: accumulate the block row's residual in a
  // stack buffer yy, then write (owned pass) or accumulate (overlap pass) it
  // into the packed SIMD view y_packed at (pri, :, col), lane v.
  template <int B, bool async, bool overlap, bool haveBlockMatrix>
  void
  operator()(const GeneralTag<B, async, overlap, haveBlockMatrix> &, const local_ordinal_type &rowidx) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // constants: map rowidx to its pack row (pri) and SIMD lane (v)
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const local_ordinal_type num_vectors = y_packed.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // temporary buffer for y flat
    impl_scalar_type yy[B == 0 ? impl_type::max_blocksize : B] = {};

    const local_ordinal_type lr = lclrow(rowidx);

    auto colindsub_used = overlap ? colindsub_remote : colindsub;
    auto rowptr_used = overlap ? rowptr_remote : rowptr;

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if constexpr (overlap) {
        // y (temporary) := 0; the overlap pass only accumulates the remote
        // contribution, which is added to y_packed below
        memset((void *)yy, 0, sizeof(impl_scalar_type) * blocksize);
      } else {
        // y := b
        const local_ordinal_type row = lr * blocksize;
        memcpy(yy, &b(row, col), sizeof(impl_scalar_type) * blocksize);
      }

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[lr];
      for (size_type k = rowptr_used[lr]; k < rowptr_used[lr + 1]; ++k) {
        const size_type j = A_k0 + colindsub_used[k];
        const local_ordinal_type A_colind_at_j = A_colind[j];
        if constexpr (haveBlockMatrix) {
          const local_ordinal_type blocksize_square = blocksize * blocksize;
          const impl_scalar_type *const AA = &tpetra_values(j * blocksize_square);
          // owned column: read from x; nonowned: read from x_remote
          if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            const impl_scalar_type *const xx = &x(loc * blocksize, col);
            SerialGemv(blocksize, AA, xx, yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
            SerialGemv(blocksize, AA, xx_remote, yy);
          }
        } else {
          if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            const impl_scalar_type *const xx = &x(loc * blocksize, col);
            for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
              SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
            for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
              SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
          }
        }
      }
      // move yy to y_packed
      if constexpr (overlap) {
        for (local_ordinal_type k = 0; k < blocksize; ++k)
          y_packed(pri, k, col)[v] += yy[k];
      } else {
        for (local_ordinal_type k = 0; k < blocksize; ++k)
          y_packed(pri, k, col)[v] = yy[k];
      }
    }
  }

  // GPU implementation for hasBlockCrsMatrix == true: one team per row index.
  // Uses the flat A/x offsets precomputed by precompute_A_x_offsets (absent
  // entries are marked with ArithTraits<int64_t>::min), accumulating A*x in
  // team scratch (local_Ax) with a per-thread scratch copy of x (local_x).
  template <int B, bool async, bool overlap>
  KOKKOS_INLINE_FUNCTION void
  operator()(const GeneralTag<B, async, overlap, true> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // constants: map rowidx to its pack row (pri) and SIMD lane (v)
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = bb;
    auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);

    // Get shared allocation for a local copy of x, Ax, and A
    impl_scalar_type *local_Ax = reinterpret_cast<impl_scalar_type *>(member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type *local_x = reinterpret_cast<impl_scalar_type *>(member.thread_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col)
        member.team_barrier();
      // y -= Rx
      // Initialize accumulation array
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
        local_Ax[i] = 0;
      });
      member.team_barrier();

      // Padded entry count of the offsets view; absent entries are skipped below.
      int numEntries;
      if constexpr (!overlap) {
        numEntries = A_x_offsets.extent(2);
      } else {
        numEntries = A_x_offsets_remote.extent(2);
      }

      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numEntries),
                           [&](const int k) {
                             int64_t A_offset = overlap ? A_x_offsets_remote(rowidx, 0, k) : A_x_offsets(rowidx, 0, k);
                             int64_t x_offset = overlap ? A_x_offsets_remote(rowidx, 1, k) : A_x_offsets(rowidx, 1, k);
                             if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
                               A_block_cst.assign_data(tpetra_values.data() + A_offset);
                               // Pull x into local memory
                               if constexpr (async) {
                                 // async: owned vs remote is decided at runtime
                                 // by comparing against the remote cutoff offset
                                 size_type remote_cutoff = blocksize * num_local_rows;
                                 if (x_offset >= remote_cutoff)
                                   xx.assign_data(&x_remote(x_offset - remote_cutoff, col));
                                 else
                                   xx.assign_data(&x(x_offset, col));
                               } else {
                                 if constexpr (!overlap) {
                                   xx.assign_data(&x(x_offset, col));
                                 } else {
                                   xx.assign_data(&x_remote(x_offset, col));
                                 }
                               }

                               Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
                                 local_x[i] = xx(i);
                               });

                               // MatVec op Ax += A*x
                               Kokkos::parallel_for(
                                   Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type &k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_Ax + k0, val);
                                   });
                             }
                           });
      member.team_barrier();
      // Update y = b - local_Ax (owned pass) or y -= local_Ax (overlap pass)
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      bb.assign_data(&b(row, col));
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
        if (!overlap)
          yy(i) = bb(i) - local_Ax[i];
        else
          yy(i) -= local_Ax[i];
      });
    }
  }

  // GPU implementation for hasBlockCrsMatrix == false: one team per row index.
  // Uses the point-CRS access path (VectorDot) rather than precomputed offsets.
  template <int B, bool async, bool overlap>
  KOKKOS_INLINE_FUNCTION void
  operator()(const GeneralTag<B, async, overlap, false> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // constants: map rowidx to its pack row (pri) and SIMD lane (v)
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = bb;
    auto xx_remote = bb;
    auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);
    auto colindsub_used = overlap ? colindsub_remote : colindsub;
    auto rowptr_used = overlap ? rowptr_remote : rowptr;

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      if (!overlap) {
        // y := b (the overlap pass accumulates on top of the owned pass result)
        bb.assign_data(&b(row, col));
        if (member.team_rank() == 0)
          VectorCopy(member, blocksize, bb, yy);
        member.team_barrier();
      }

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[lr];
      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr + 1]),
                           [&](const local_ordinal_type &k) {
                             const size_type j = A_k0 + colindsub_used[k];
                             const local_ordinal_type A_colind_at_j = A_colind[j];
                             if ((async && A_colind_at_j < num_local_rows) || (!async && !overlap)) {
                               const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                               xx.assign_data(&x(loc * blocksize, col));
                               for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                                 VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
                             } else {
                               const auto loc = A_colind_at_j - num_local_rows;
                               xx_remote.assign_data(&x_remote(loc * blocksize, col));
                               for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                                 VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
                             }
                           });
    }
  }
};
743
744// y = b - Rx; seq method
745template <typename MatrixType>
746void ComputeResidualVector<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::run(const impl_scalar_type_2d_view_tpetra &y_,
747 const Const<impl_scalar_type_2d_view_tpetra> &b_,
748 const impl_scalar_type_2d_view_tpetra &x_) {
749 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
750 IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<SeqTag>", ComputeResidual0, execution_space);
751
752 using Functor = ComputeResidualFunctor<typename impl_type::matrix_type>;
753 Functor functor(*this);
754
755 functor.y = y_;
756 functor.b = b_;
757 functor.x = x_;
758 if constexpr (is_device<execution_space>::value) {
759 const local_ordinal_type blocksize = blocksize_requested;
760 const local_ordinal_type team_size = 8;
761 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
762 const Kokkos::TeamPolicy<execution_space, typename Functor::SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
763 Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<SeqTag>", policy, functor);
764 } else {
765 const Kokkos::RangePolicy<execution_space, typename Functor::SeqTag> policy(0, rowptr.extent(0) - 1);
766 Kokkos::parallel_for("ComputeResidual::RangePolicy::run<SeqTag>", policy, functor);
767 }
768 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
769 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
770}
771
// y = b - R (x , x_remote)
//
// AsyncTag variant: computes the packed residual in one pass per row, reading
// locally owned columns from x and halo columns from x_remote.  On device the
// packed SIMD output y_packed_ is reinterpreted as a scalar 4-D view (the
// SIMD lanes become the last, fastest-running dimension); on host the packed
// view is used directly.
template <typename MatrixType>
void ComputeResidualVector<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::run(const vector_type_3d_view &y_packed_,
                                                                                     const Const<impl_scalar_type_2d_view_tpetra> &b_,
                                                                                     const impl_scalar_type_2d_view_tpetra &x_,
                                                                                     const impl_scalar_type_2d_view_tpetra &x_remote_) {
  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<AsyncTag>", ComputeResidual0, execution_space);

  using Functor = ComputeResidualFunctor<typename impl_type::matrix_type>;
  Functor functor(*this);

  // Shallow-copy the input views into the functor.
  functor.b = b_;
  functor.x = x_;
  functor.x_remote = x_remote_;
  if constexpr (is_device<execution_space>::value) {
    // Alias the SIMD-packed 3-D view as a scalar 4-D view; extent(3) runs
    // over the vector_length SIMD lanes.
    functor.y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
                                                       y_packed_.extent(0),
                                                       y_packed_.extent(1),
                                                       y_packed_.extent(2),
                                                       vector_length);
  } else {
    functor.y_packed = y_packed_;
  }

  if constexpr (is_device<execution_space>::value) {
    const local_ordinal_type blocksize = blocksize_requested;
    // local_ordinal_type vl_power_of_two = 1;
    // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
    // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
    // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
    // The macro expands a kernel launch for compile-time blocksize B
    // (B = 0 means generic runtime blocksize).  Each expansion ends in
    // `break`, so the switch cases below do not fall through.  For the
    // BlockCrs path, team and thread scratch each hold one block row of
    // btdm scalars.
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                                                          \
  {                                                                                                                             \
    if (this->hasBlockCrsMatrix) {                                                                                              \
      const local_ordinal_type team_size = 8;                                                                                   \
      const local_ordinal_type vector_size = 8;                                                                                 \
      const size_t shmem_team_size = blocksize * sizeof(btdm_scalar_type);                                                      \
      const size_t shmem_thread_size = blocksize * sizeof(btdm_scalar_type);                                                    \
      Kokkos::TeamPolicy<execution_space, typename Functor::template AsyncTag<B, true>>                                         \
          policy(rowidx2part.extent(0), team_size, vector_size);                                                                \
      policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size));                       \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>",                                                        \
                           policy, functor);                                                                                    \
    } else {                                                                                                                    \
      const local_ordinal_type team_size = 8;                                                                                   \
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
      const Kokkos::TeamPolicy<execution_space, typename Functor::template AsyncTag<B, false>>                                  \
          policy(rowidx2part.extent(0), team_size, vector_size);                                                                \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>",                                                        \
                           policy, functor);                                                                                    \
    }                                                                                                                           \
  }                                                                                                                             \
  break
    // Common block sizes get a dedicated compile-time instantiation; anything
    // else takes the generic (B = 0) path.
    switch (blocksize_requested) {
      case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
      case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
      case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
      case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
      case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
  } else {
    // Host: flat RangePolicy over the rows; no scratch memory is needed.
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                                                            \
  {                                                                                                                               \
    if (this->hasBlockCrsMatrix) {                                                                                                \
      const Kokkos::RangePolicy<execution_space, typename Functor::template AsyncTag<B, true>> policy(0, rowidx2part.extent(0));  \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>",                                                         \
                           policy, functor);                                                                                      \
    } else {                                                                                                                      \
      const Kokkos::RangePolicy<execution_space, typename Functor::template AsyncTag<B, false>> policy(0, rowidx2part.extent(0)); \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>",                                                         \
                           policy, functor);                                                                                      \
    }                                                                                                                             \
  }                                                                                                                               \
  break

    switch (blocksize_requested) {
      case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
      case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
      case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
      case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
      case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
  }
  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
}
870
// y = b - R (x , x_remote)
// (comment fixed: the inputs are x_/x_remote_, matching the AsyncTag overload)
//
// OverlapTag variant: same residual as the AsyncTag overload, but split into
// two passes so halo communication can overlap with computation.
// compute_owned == true launches the OverlapTag<0, ...> kernel (presumably
// the locally-owned-column pass), compute_owned == false launches
// OverlapTag<1, ...> (presumably the remote/halo-column pass) -- confirm
// against the functor's tag definitions.
template <typename MatrixType>
void ComputeResidualVector<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::run(const vector_type_3d_view &y_packed_,
                                                                                     const Const<impl_scalar_type_2d_view_tpetra> &b_,
                                                                                     const impl_scalar_type_2d_view_tpetra &x_,
                                                                                     const impl_scalar_type_2d_view_tpetra &x_remote_,
                                                                                     const bool compute_owned) {
  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<OverlapTag>", ComputeResidual0, execution_space);

  using Functor = ComputeResidualFunctor<typename impl_type::matrix_type>;
  Functor functor(*this);

  // Shallow-copy the input views into the functor.
  functor.b = b_;
  functor.x = x_;
  functor.x_remote = x_remote_;
  if constexpr (is_device<execution_space>::value) {
    // Alias the SIMD-packed 3-D view as a scalar 4-D view; extent(3) runs
    // over the vector_length SIMD lanes.
    functor.y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
                                                       y_packed_.extent(0),
                                                       y_packed_.extent(1),
                                                       y_packed_.extent(2),
                                                       vector_length);
  } else {
    functor.y_packed = y_packed_;
  }

  if constexpr (is_device<execution_space>::value) {
    const local_ordinal_type blocksize = blocksize_requested;
    // local_ordinal_type vl_power_of_two = 1;
    // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
    // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
    // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
    // The macro expands a kernel launch for compile-time blocksize B
    // (B = 0 means generic runtime blocksize) and selects OverlapTag<0>
    // or OverlapTag<1> from compute_owned.  Each expansion ends in `break`,
    // so the switch cases below do not fall through.  For the BlockCrs path,
    // team and thread scratch each hold one block row of btdm scalars.
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                                                          \
  if (this->hasBlockCrsMatrix) {                                                                                                \
    const local_ordinal_type team_size = 8;                                                                                     \
    const local_ordinal_type vector_size = 8;                                                                                   \
    const size_t shmem_team_size = blocksize * sizeof(btdm_scalar_type);                                                        \
    const size_t shmem_thread_size = blocksize * sizeof(btdm_scalar_type);                                                      \
    if (compute_owned) {                                                                                                        \
      Kokkos::TeamPolicy<execution_space, typename Functor::template OverlapTag<0, B, true>>                                    \
          policy(rowidx2part.extent(0), team_size, vector_size);                                                                \
      policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size));                       \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, functor);                                \
    } else {                                                                                                                    \
      Kokkos::TeamPolicy<execution_space, typename Functor::template OverlapTag<1, B, true>>                                    \
          policy(rowidx2part.extent(0), team_size, vector_size);                                                                \
      policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size));                       \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, functor);                                \
    }                                                                                                                           \
  } else {                                                                                                                      \
    const local_ordinal_type team_size = 8;                                                                                     \
    const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);   \
    if (compute_owned) {                                                                                                        \
      const Kokkos::TeamPolicy<execution_space, typename Functor::template OverlapTag<0, B, false>>                             \
          policy(rowidx2part.extent(0), team_size, vector_size);                                                                \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, functor);                                \
    } else {                                                                                                                    \
      const Kokkos::TeamPolicy<execution_space, typename Functor::template OverlapTag<1, B, false>>                             \
          policy(rowidx2part.extent(0), team_size, vector_size);                                                                \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, functor);                                \
    }                                                                                                                           \
  }                                                                                                                             \
  break
    // Common block sizes get a dedicated compile-time instantiation; anything
    // else takes the generic (B = 0) path.
    switch (blocksize_requested) {
      case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
      case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
      case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
      case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
      case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
  } else {
    // Host: flat RangePolicy over the rows; no scratch memory is needed.
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                                                          \
  if (this->hasBlockCrsMatrix) {                                                                                                \
    if (compute_owned) {                                                                                                        \
      const Kokkos::RangePolicy<execution_space, typename Functor::template OverlapTag<0, B, true>>                             \
          policy(0, rowidx2part.extent(0));                                                                                     \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, functor);                               \
    } else {                                                                                                                    \
      const Kokkos::RangePolicy<execution_space, typename Functor::template OverlapTag<1, B, true>>                             \
          policy(0, rowidx2part.extent(0));                                                                                     \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, functor);                               \
    }                                                                                                                           \
  } else {                                                                                                                      \
    if (compute_owned) {                                                                                                        \
      const Kokkos::RangePolicy<execution_space, typename Functor::template OverlapTag<0, B, false>>                            \
          policy(0, rowidx2part.extent(0));                                                                                     \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, functor);                               \
    } else {                                                                                                                    \
      const Kokkos::RangePolicy<execution_space, typename Functor::template OverlapTag<1, B, false>>                            \
          policy(0, rowidx2part.extent(0));                                                                                     \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, functor);                               \
    }                                                                                                                           \
  }                                                                                                                             \
  break

    switch (blocksize_requested) {
      case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
      case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
      case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
      case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
      case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
  }
  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
}
989
990} // namespace Ifpack2::BlockHelperDetails
991
992#define IFPACK2_BLOCKCOMPUTERESIDUALVECTOR_INSTANT(S, LO, GO, N) \
993 template class Ifpack2::BlockHelperDetails::ComputeResidualVector<Tpetra::RowMatrix<S, LO, GO, N>>;
994
995#endif
size_t size_type
Definition Ifpack2_BlockHelper.hpp:274
node_type::device_type node_device_type
Definition Ifpack2_BlockHelper.hpp:298
KokkosKernels::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition Ifpack2_BlockHelper.hpp:284
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition Ifpack2_BlockHelper.hpp:347