#ifndef IFPACK2_BLOCKCOMPUTERES_VECTOR_DEF_HPP
#define IFPACK2_BLOCKCOMPUTERES_VECTOR_DEF_HPP

#include "Ifpack2_BlockComputeResidualVector_decl.hpp"

namespace Ifpack2::BlockHelperDetails {
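
// Precompute, for every local row, the flattened (A value, x entry) offset
// pairs used by the fused residual kernels below:
//   A_x_offsets(i, 0, k) = j * blocksize^2   -> start of the k-th block of row i in tpetra_values
//   A_x_offsets(i, 1, k) = loc * blocksize   -> start of the matching rows of x
// so the kernels can gather blocks without re-walking the graph. Rows shorter
// than the widest row get padding slots tagged with
// KokkosKernels::ArithTraits<int64_t>::min(), which the kernels skip.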
template <typename MatrixType>
void ComputeResidualVector<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::precompute_A_x_offsets(
    AmD<MatrixType> &amd,
    const PartInterface<MatrixType> &interf,
    const Teuchos::RCP<const typename impl_type::tpetra_crs_graph_type> &g,
    const local_ordinal_type_1d_view &dm2cm,
    const local_ordinal_type blocksize,
    bool ownedRemoteSeparate) {
  using impl_type = ImplType<MatrixType>;
  using i64_3d_view = typename impl_type::i64_3d_view;
  using size_type = typename impl_type::size_type;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using execution_space = typename impl_type::execution_space;

  auto local_graph = g->getLocalGraphDevice();
  const auto A_block_rowptr = local_graph.row_map;
  const auto A_colind = local_graph.entries;
  local_ordinal_type numLocalRows = interf.rowidx2part.extent(0);
  int blocksize_square = blocksize * blocksize;

  // Local copies of the views, so the lambdas below do not capture this/amd/interf.
  auto lclrow = interf.lclrow;
  auto A_colindsub = amd.A_colindsub;
  auto A_colindsub_remote = amd.A_colindsub_remote;
  auto rowptr = amd.rowptr;
  auto rowptr_remote = amd.rowptr_remote;
  bool is_dm2cm_active = dm2cm.extent(0);

  if (ownedRemoteSeparate) {
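    // Separate owned/remote path: build one offsets view for owned columns and
    // one for remote (ghost) columns so the two phases can run independently.
    // Pass 1: size both views by the widest row; shorter rows get padded slots.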
    local_ordinal_type maxOwnedEntriesPerRow = 0;
    local_ordinal_type maxNonownedEntriesPerRow = 0;
    Kokkos::parallel_reduce(
        Kokkos::RangePolicy<execution_space>(0, numLocalRows),
        KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type &lmaxOwned, local_ordinal_type &lmaxNonowned) {
          const local_ordinal_type lr = lclrow(i);
          local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowptr(lr);
          if (rowNumOwned > lmaxOwned)
            lmaxOwned = rowNumOwned;
          if (rowptr_remote.extent(0)) {
            local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowptr_remote(lr);
            if (rowNumNonowned > lmaxNonowned)
              lmaxNonowned = rowNumNonowned;
          }
        },
        Kokkos::Max<local_ordinal_type>(maxOwnedEntriesPerRow), Kokkos::Max<local_ordinal_type>(maxNonownedEntriesPerRow));
    amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxOwnedEntriesPerRow);
    amd.A_x_offsets_remote = i64_3d_view("amd.A_x_offsets_remote", numLocalRows, 2, maxNonownedEntriesPerRow);
    auto A_x_offsets = amd.A_x_offsets;
    auto A_x_offsets_remote = amd.A_x_offsets_remote;

    // Pass 2: fill the owned and remote offsets for each row.
    Kokkos::parallel_for(
        Kokkos::RangePolicy<execution_space>(0, numLocalRows),
        KOKKOS_LAMBDA(local_ordinal_type i) {
          const local_ordinal_type lr = lclrow(i);
          const size_type A_k0 = A_block_rowptr(lr);
          // Owned entries.
          size_type rowBegin = rowptr(lr);
          local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowBegin;
          for (local_ordinal_type entry = 0; entry < maxOwnedEntriesPerRow; entry++) {
            if (entry < rowNumOwned) {
              const size_type j = A_k0 + A_colindsub(rowBegin + entry);
              const local_ordinal_type A_colind_at_j = A_colind(j);
              const local_ordinal_type loc = is_dm2cm_active ? dm2cm(A_colind_at_j) : A_colind_at_j;
              A_x_offsets(i, 0, entry) = int64_t(j) * blocksize_square;
              A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
            } else {
              A_x_offsets(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
              A_x_offsets(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
            }
          }
          // Remote (nonowned) entries.
          if (rowptr_remote.extent(0)) {
            rowBegin = rowptr_remote(lr);
            local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowBegin;
            for (local_ordinal_type entry = 0; entry < maxNonownedEntriesPerRow; entry++) {
              if (entry < rowNumNonowned) {
                const size_type j = A_k0 + A_colindsub_remote(rowBegin + entry);
                const local_ordinal_type A_colind_at_j = A_colind(j);
                const local_ordinal_type loc = A_colind_at_j - numLocalRows;
                A_x_offsets_remote(i, 0, entry) = int64_t(j) * blocksize_square;
                A_x_offsets_remote(i, 1, entry) = int64_t(loc) * blocksize;
              } else {
                A_x_offsets_remote(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
                A_x_offsets_remote(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
              }
            }
          }
        });
  } else {
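    // Combined path: one offsets view covers both owned and nonowned columns.
    // Nonowned columns keep their unshifted x offset (>= blocksize * numLocalRows),
    // which is how the AsyncTag kernels later tell x and x_remote apart.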
    local_ordinal_type maxEntriesPerRow = 0;
    Kokkos::parallel_reduce(
        Kokkos::RangePolicy<execution_space>(0, numLocalRows),
        KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type &lmax) {
          const local_ordinal_type lr = lclrow(i);
          local_ordinal_type rowNum = rowptr(lr + 1) - rowptr(lr);
          if (rowNum > lmax)
            lmax = rowNum;
        },
        Kokkos::Max<local_ordinal_type>(maxEntriesPerRow));
    amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxEntriesPerRow);
    auto A_x_offsets = amd.A_x_offsets;

    Kokkos::parallel_for(
        Kokkos::RangePolicy<execution_space>(0, numLocalRows),
        KOKKOS_LAMBDA(local_ordinal_type i) {
          const local_ordinal_type lr = lclrow(i);
          const size_type A_k0 = A_block_rowptr(lr);
          size_type rowBegin = rowptr(lr);
          local_ordinal_type rowOwned = rowptr(lr + 1) - rowBegin;
          for (local_ordinal_type entry = 0; entry < maxEntriesPerRow; entry++) {
            if (entry < rowOwned) {
              const size_type j = A_k0 + A_colindsub(rowBegin + entry);
              A_x_offsets(i, 0, entry) = j * blocksize_square;
              const local_ordinal_type A_colind_at_j = A_colind(j);
              if (A_colind_at_j < numLocalRows) {
                const local_ordinal_type loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
              } else {
                A_x_offsets(i, 1, entry) = int64_t(A_colind_at_j) * blocksize;
              }
            } else {
              A_x_offsets(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
              A_x_offsets(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
            }
          }
        });
  }
}
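
// Recommended vector sizes for the device team policies below. Each backend
// maps the block size to a total team size, then derives the vector length by
// dividing by the requested team size.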
static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
                                                                 const int team_size) {
  int total_team_size(0);
  if (blksize <= 5)
    total_team_size = 32;
  else if (blksize <= 9)
    total_team_size = 32;
  else if (blksize <= 12)
    total_team_size = 96;
  else if (blksize <= 16)
    total_team_size = 128;
  else if (blksize <= 20)
    total_team_size = 160;
  else
    total_team_size = 160;
  return total_team_size / team_size;
}

static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
                                                                const int team_size) {
  int total_team_size(0);
  if (blksize <= 5)
    total_team_size = 32;
  else if (blksize <= 9)
    total_team_size = 32;
  else if (blksize <= 12)
    total_team_size = 96;
  else if (blksize <= 16)
    total_team_size = 128;
  else if (blksize <= 20)
    total_team_size = 160;
  else
    total_team_size = 160;
  return total_team_size / team_size;
}

static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
                                                                 const int team_size) {
  int total_team_size(0);
  if (blksize <= 5)
    total_team_size = 32;
  else if (blksize <= 9)
    total_team_size = 32;
  else if (blksize <= 12)
    total_team_size = 96;
  else if (blksize <= 16)
    total_team_size = 128;
  else if (blksize <= 20)
    total_team_size = 160;
  else
    total_team_size = 160;
  return total_team_size / team_size;
}

template <typename T>
static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
                                                             const int team_size) {
  if (is_cuda<T>::value)
    return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
  if (is_hip<T>::value)
    return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
  if (is_sycl<T>::value)
    return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
  return -1;  // only the device callers below use this function
}
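
// Functor computing y := b - R x, where R is the off-block-diagonal part of A.
// One functor serves all dispatch tags: SeqTag (flat y, owned x only), AsyncTag
// (packed y, owned and remote x handled in one sweep), and OverlapTag (owned
// and remote contributions split into two launches so communication can
// overlap computation).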
template <typename MatrixType>
struct ComputeResidualFunctor {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;

  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type;
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;
  static constexpr int vector_length = impl_type::vector_length;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
  Unmanaged<vector_type_3d_view> y_packed;
  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;

  // AmD information
  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // matrix graph information
  const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_block_rowptr;
  const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_point_rowptr;
  const ConstUnmanaged<Kokkos::View<local_ordinal_type *, node_device_type>> A_colind;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;

  // offsets precomputed by precompute_A_x_offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  const bool is_dm2cm_active;
  const bool hasBlockCrsMatrix;

  ComputeResidualFunctor(const ComputeResidualVector<MatrixType> &crv)
      : rowptr(crv.rowptr)
      , rowptr_remote(crv.rowptr_remote)
      , colindsub(crv.colindsub)
      , colindsub_remote(crv.colindsub_remote)
      , tpetra_values(crv.tpetra_values)
      , A_block_rowptr(crv.A_block_rowptr)
      , A_point_rowptr(crv.A_point_rowptr)
      , A_colind(crv.A_colind)
      , blocksize_requested(crv.blocksize_requested)
      , part2packrowidx0(crv.part2packrowidx0)
      , part2rowidx0(crv.part2rowidx0)
      , rowidx2part(crv.rowidx2part)
      , partptr(crv.partptr)
      , lclrow(crv.lclrow)
      , dm2cm(crv.dm2cm)
      , A_x_offsets(crv.A_x_offsets)
      , A_x_offsets_remote(crv.A_x_offsets_remote)
      , is_dm2cm_active(crv.is_dm2cm_active)
      , hasBlockCrsMatrix(crv.hasBlockCrsMatrix) {}
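
  // The Serial* helpers below are used by the RangePolicy (host) paths, each
  // calling thread handling a whole block row. The Vector* helpers are used by
  // the TeamPolicy (device) paths and spread each block over a thread-vector
  // range, so the dot/gemv variants accumulate into y with atomics.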
  KOKKOS_INLINE_FUNCTION
  void
  SerialDot(const local_ordinal_type &blocksize,
            const local_ordinal_type &lclRowID,
            const local_ordinal_type &lclColID,
            const local_ordinal_type &ii,
            const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
            const impl_scalar_type *const KOKKOS_RESTRICT xx,
            impl_scalar_type *KOKKOS_RESTRICT yy) const {
    const size_type Aj_c = colindsub_(lclColID);
    auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
    impl_scalar_type val = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#pragma unroll
#endif
    for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
      val += tpetra_values(point_row_offset + k1) * xx[k1];
    yy[ii] -= val;
  }

  KOKKOS_INLINE_FUNCTION
  void
  SerialGemv(const local_ordinal_type &blocksize,
             const impl_scalar_type *const KOKKOS_RESTRICT AA,
             const impl_scalar_type *const KOKKOS_RESTRICT xx,
             impl_scalar_type *KOKKOS_RESTRICT yy) const {
    using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
    for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
      impl_scalar_type val = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#pragma unroll
#endif
      for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
        val += AA[tlb::getFlatIndex(k0, k1, blocksize)] * xx[k1];
      yy[k0] -= val;
    }
  }

  template <typename bbViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION void
  VectorCopy(const member_type &member,
             const local_ordinal_type &blocksize,
             const bbViewType &bb,
             const yyViewType &yy) const {
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
      yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
    });
  }

  template <typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION void
  VectorDot(const member_type &member,
            const local_ordinal_type &blocksize,
            const local_ordinal_type &lclRowID,
            const local_ordinal_type &lclColID,
            const local_ordinal_type &ii,
            const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
            const xxViewType &xx,
            const yyViewType &yy) const {
    const size_type Aj_c = colindsub_(lclColID);
    auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
    impl_scalar_type val = 0;
    Kokkos::parallel_reduce(
        Kokkos::ThreadVectorRange(member, blocksize),
        [&](const local_ordinal_type &k1, impl_scalar_type &update) {
          update += tpetra_values(point_row_offset + k1) * xx(k1);
        },
        val);
    Kokkos::single(Kokkos::PerThread(member),
                   [=]() {
                     Kokkos::atomic_add(&yy(ii), typename yyViewType::const_value_type(-val));
                   });
  }

  template <typename AAViewType, typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION void
  VectorGemv(const member_type &member,
             const local_ordinal_type &blocksize,
             const AAViewType &AA,
             const xxViewType &xx,
             const yyViewType &yy) const {
    for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
      impl_scalar_type val = 0;
      Kokkos::parallel_reduce(
          Kokkos::ThreadVectorRange(member, blocksize),
          [&](const local_ordinal_type &k1, impl_scalar_type &update) {
            update += AA(k0, k1) * xx(k1);
          },
          val);
      Kokkos::single(Kokkos::PerThread(member),
                     [=]() {
                       Kokkos::atomic_add(&yy(k0), -val);
                     });
    }
  }
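
  // SeqTag: plain residual on the owned rows, writing straight into the flat y
  // view. No part packing and no owned/remote split.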
  struct SeqTag {};

  KOKKOS_INLINE_FUNCTION
  void
  operator()(const SeqTag &, const local_ordinal_type &i) const {
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize * blocksize;

    // constants
    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);
    const local_ordinal_type row = i * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // y := b
      impl_scalar_type *yy = &y(row, col);
      const impl_scalar_type *const bb = &b(row, col);
      memcpy(yy, bb, sizeof(impl_scalar_type) * blocksize);

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[i];
      for (size_type k = rowptr[i]; k < rowptr[i + 1]; ++k) {
        const size_type j = A_k0 + colindsub[k];
        const impl_scalar_type *const xx = &x(A_colind[j] * blocksize, col);
        if (hasBlockCrsMatrix) {
          const impl_scalar_type *const AA = &tpetra_values(j * blocksize_square);
          SerialGemv(blocksize, AA, xx, yy);
        } else {
          for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
            SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
        }
      }
    }
  }

  KOKKOS_INLINE_FUNCTION
  void
  operator()(const SeqTag &, const member_type &member) const {
    // constants
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize * blocksize;

    const local_ordinal_type lr = member.league_rank();
    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);

    // subview pattern: rebind the data pointers instead of creating new subviews
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = bb;
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);

    const local_ordinal_type row = lr * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // y := b
      auto yy = Kokkos::subview(y, Kokkos::make_pair(row, row + blocksize), col);
      bb.assign_data(&b(row, col));
      if (member.team_rank() == 0)
        VectorCopy(member, blocksize, bb, yy);
      member.team_barrier();

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[lr];
      if (hasBlockCrsMatrix) {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
                             [&](const local_ordinal_type &k) {
                               const size_type j = A_k0 + colindsub[k];
                               xx.assign_data(&x(A_colind[j] * blocksize, col));
                               A_block_cst.assign_data(&tpetra_values(j * blocksize_square));
                               VectorGemv(member, blocksize, A_block_cst, xx, yy);
                             });
      } else {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
                             [&](const local_ordinal_type &k) {
                               const size_type j = A_k0 + colindsub[k];
                               xx.assign_data(&x(A_colind[j] * blocksize, col));
                               for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                                 VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
                             });
      }
    }
  }
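
  // GeneralTag parameters: B is the compile-time blocksize (0 means use
  // blocksize_requested at run time); async = owned and remote columns handled
  // in one sweep against x/x_remote; overlap = this launch handles only the
  // remote columns (second phase of the split scheme); haveBlockMatrix selects
  // BlockCrs value layout versus point CRS values.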
  template <int B, bool async, bool overlap, bool haveBlockMatrix>
  struct GeneralTag {
    static_assert(!(async && overlap),
                  "ComputeResidualVector: async && overlap is not a valid configuration for GeneralTag");
  };

  template <int P, int B, bool haveBlockMatrix>
  using OverlapTag = GeneralTag<B, false, P != 0, haveBlockMatrix>;

  template <int B, bool haveBlockMatrix>
  using AsyncTag = GeneralTag<B, true, false, haveBlockMatrix>;

  template <int B, bool async, bool overlap, bool haveBlockMatrix>
  KOKKOS_INLINE_FUNCTION
  void
  operator()(const GeneralTag<B, async, overlap, haveBlockMatrix> &, const local_ordinal_type &rowidx) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // part and pack indices for this row
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const local_ordinal_type num_vectors = y_packed.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // temporary flat buffer for one block row of y
    impl_scalar_type yy[B == 0 ? impl_type::max_blocksize : B] = {};

    const local_ordinal_type lr = lclrow(rowidx);

    auto colindsub_used = overlap ? colindsub_remote : colindsub;
    auto rowptr_used = overlap ? rowptr_remote : rowptr;

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if constexpr (overlap) {
        // y (temporary) := 0; the remote contribution is accumulated below
        memset((void *)yy, 0, sizeof(impl_scalar_type) * blocksize);
      } else {
        // y := b
        const local_ordinal_type row = lr * blocksize;
        memcpy(yy, &b(row, col), sizeof(impl_scalar_type) * blocksize);
      }

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[lr];
      for (size_type k = rowptr_used[lr]; k < rowptr_used[lr + 1]; ++k) {
        const size_type j = A_k0 + colindsub_used[k];
        const local_ordinal_type A_colind_at_j = A_colind[j];
        if constexpr (haveBlockMatrix) {
          const local_ordinal_type blocksize_square = blocksize * blocksize;
          const impl_scalar_type *const AA = &tpetra_values(j * blocksize_square);
          if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            const impl_scalar_type *const xx = &x(loc * blocksize, col);
            SerialGemv(blocksize, AA, xx, yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
            SerialGemv(blocksize, AA, xx_remote, yy);
          }
        } else {
          if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            const impl_scalar_type *const xx = &x(loc * blocksize, col);
            for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
              SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
            for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
              SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
          }
        }
      }

      // move yy into the packed y: accumulate in the remote phase, assign otherwise
      if constexpr (overlap) {
        for (local_ordinal_type k = 0; k < blocksize; ++k)
          y_packed(pri, k, col)[v] += yy[k];
      } else {
        for (local_ordinal_type k = 0; k < blocksize; ++k)
          y_packed(pri, k, col)[v] = yy[k];
      }
    }
  }
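
  // Team variant for BlockCrs values: walks the offset pairs built by
  // precompute_A_x_offsets instead of the graph, staging x in per-thread
  // scratch and accumulating the block-row product A*x in team scratch.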
  template <int B, bool async, bool overlap>
  KOKKOS_INLINE_FUNCTION
  void
  operator()(const GeneralTag<B, async, overlap, true> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // part and pack indices for this row
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern: rebind the data pointers instead of creating new subviews
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = bb;
    auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);

    // scratch: A*x accumulator per team, x staging per thread
    impl_scalar_type *local_Ax = reinterpret_cast<impl_scalar_type *>(member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type *local_x = reinterpret_cast<impl_scalar_type *>(member.thread_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      member.team_barrier();
      // local_Ax := 0
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
        local_Ax[i] = 0;
      });
      member.team_barrier();

      local_ordinal_type numEntries;
      if constexpr (!overlap) {
        numEntries = A_x_offsets.extent(2);
      } else {
        numEntries = A_x_offsets_remote.extent(2);
      }

      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numEntries),
                           [&](const local_ordinal_type &k) {
                             int64_t A_offset = overlap ? A_x_offsets_remote(rowidx, 0, k) : A_x_offsets(rowidx, 0, k);
                             int64_t x_offset = overlap ? A_x_offsets_remote(rowidx, 1, k) : A_x_offsets(rowidx, 1, k);
                             if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
                               A_block_cst.assign_data(tpetra_values.data() + A_offset);
                               // pick x versus x_remote based on the encoded offset
                               if constexpr (async) {
                                 size_type remote_cutoff = blocksize * num_local_rows;
                                 if (x_offset >= remote_cutoff)
                                   xx.assign_data(&x_remote(x_offset - remote_cutoff, col));
                                 else
                                   xx.assign_data(&x(x_offset, col));
                               } else {
                                 if constexpr (!overlap) {
                                   xx.assign_data(&x(x_offset, col));
                                 } else {
                                   xx.assign_data(&x_remote(x_offset, col));
                                 }
                               }
                               // stage x in per-thread scratch
                               Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
                                 local_x[i] = xx(i);
                               });
                               // local_Ax += A_block * local_x
                               Kokkos::parallel_for(
                                   Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type &k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_Ax + k0, val);
                                   });
                             }
                           });
      member.team_barrier();
      // y := b - local_Ax, or y -= local_Ax in the overlapped remote phase
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      bb.assign_data(&b(row, col));
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
        if constexpr (!overlap)
          yy(i) = bb(i) - local_Ax[i];
        else
          yy(i) -= local_Ax[i];
      });
    }
  }

  template <int B, bool async, bool overlap>
  KOKKOS_INLINE_FUNCTION
  void
  operator()(const GeneralTag<B, async, overlap, false> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // part and pack indices for this row
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern: rebind the data pointers instead of creating new subviews
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = bb;
    auto xx_remote = bb;
    auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);
    auto colindsub_used = overlap ? colindsub_remote : colindsub;
    auto rowptr_used = overlap ? rowptr_remote : rowptr;

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      if constexpr (!overlap) {
        // y := b
        bb.assign_data(&b(row, col));
        if (member.team_rank() == 0)
          VectorCopy(member, blocksize, bb, yy);
        member.team_barrier();
      }

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[lr];
      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr + 1]),
                           [&](const local_ordinal_type &k) {
                             const size_type j = A_k0 + colindsub_used[k];
                             const local_ordinal_type A_colind_at_j = A_colind[j];
                             if ((async && A_colind_at_j < num_local_rows) || (!async && !overlap)) {
                               const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                               xx.assign_data(&x(loc * blocksize, col));
                               for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                                 VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
                             } else {
                               const auto loc = A_colind_at_j - num_local_rows;
                               xx_remote.assign_data(&x_remote(loc * blocksize, col));
                               for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                                 VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
                             }
                           });
    }
  }
};
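
// The run() overloads below construct the functor and dispatch it with a
// TeamPolicy on device backends or a RangePolicy on host, selecting the
// compile-time blocksize via the switch over blocksize_requested.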
template <typename MatrixType>
void ComputeResidualVector<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::run(const impl_scalar_type_2d_view_tpetra &y_,
                                                                                     const Const<impl_scalar_type_2d_view_tpetra> &b_,
                                                                                     const impl_scalar_type_2d_view_tpetra &x_) {
  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<SeqTag>", ComputeResidual0, execution_space);

  using Functor = ComputeResidualFunctor<typename impl_type::matrix_type>;
  Functor functor(*this);
  functor.y = y_;
  functor.b = b_;
  functor.x = x_;

  if constexpr (is_device<execution_space>::value) {
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
    const Kokkos::TeamPolicy<execution_space, typename Functor::SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
    Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<SeqTag>", policy, functor);
  } else {
    const Kokkos::RangePolicy<execution_space, typename Functor::SeqTag> policy(0, rowptr.extent(0) - 1);
    Kokkos::parallel_for("ComputeResidual::RangePolicy::run<SeqTag>", policy, functor);
  }
  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
}
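
// AsyncTag flavor: y_packed_ holds SIMD packs; on device it is reinterpreted
// as a scalar 4-D view (pack lane as the last index) so team kernels can
// address individual lanes.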
template <typename MatrixType>
void ComputeResidualVector<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::run(const vector_type_3d_view &y_packed_,
                                                                                     const Const<impl_scalar_type_2d_view_tpetra> &b_,
                                                                                     const impl_scalar_type_2d_view_tpetra &x_,
                                                                                     const impl_scalar_type_2d_view_tpetra &x_remote_) {
  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<AsyncTag>", ComputeResidual0, execution_space);

  using Functor = ComputeResidualFunctor<typename impl_type::matrix_type>;
  Functor functor(*this);
  functor.b = b_;
  functor.x = x_;
  functor.x_remote = x_remote_;
  if constexpr (is_device<execution_space>::value) {
    functor.y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
                                                       y_packed_.extent(0),
                                                       y_packed_.extent(1),
                                                       y_packed_.extent(2),
                                                       impl_type::vector_length);
  } else {
    functor.y_packed = y_packed_;
  }

  if constexpr (is_device<execution_space>::value) {
    const local_ordinal_type blocksize = blocksize_requested;
    // BlockCrs path: fixed team/vector sizes plus scratch for x and A*x;
    // point-CRS path: recommended vector size, no scratch.
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                                                          \
  {                                                                                                                             \
    if (this->hasBlockCrsMatrix) {                                                                                              \
      const local_ordinal_type team_size = 8;                                                                                   \
      const local_ordinal_type vector_size = 8;                                                                                 \
      const size_t shmem_team_size = blocksize * sizeof(btdm_scalar_type);                                                      \
      const size_t shmem_thread_size = blocksize * sizeof(btdm_scalar_type);                                                    \
      Kokkos::TeamPolicy<execution_space, typename Functor::template AsyncTag<B, true>>                                         \
          policy(rowidx2part.extent(0), team_size, vector_size);                                                                \
      policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size));                       \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>",                                                        \
                           policy, functor);                                                                                    \
    } else {                                                                                                                    \
      const local_ordinal_type team_size = 8;                                                                                   \
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
      const Kokkos::TeamPolicy<execution_space, typename Functor::template AsyncTag<B, false>>                                  \
          policy(rowidx2part.extent(0), team_size, vector_size);                                                                \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>",                                                        \
                           policy, functor);                                                                                    \
    }                                                                                                                           \
  }                                                                                                                             \
  break
    switch (blocksize_requested) {
      case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
      case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
      case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
      case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
      case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
  } else {
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                                                            \
  {                                                                                                                               \
    if (this->hasBlockCrsMatrix) {                                                                                                \
      const Kokkos::RangePolicy<execution_space, typename Functor::template AsyncTag<B, true>> policy(0, rowidx2part.extent(0));  \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>",                                                         \
                           policy, functor);                                                                                      \
    } else {                                                                                                                      \
      const Kokkos::RangePolicy<execution_space, typename Functor::template AsyncTag<B, false>> policy(0, rowidx2part.extent(0)); \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>",                                                         \
                           policy, functor);                                                                                      \
    }                                                                                                                             \
  }                                                                                                                               \
  break
    switch (blocksize_requested) {
      case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
      case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
      case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
      case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
      case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
  }
  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
}

// OverlapTag flavor: OverlapTag<0> computes the owned contribution (including
// y := b), OverlapTag<1> subtracts the remote contribution once it arrives.
template <typename MatrixType>
void ComputeResidualVector<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::run(const vector_type_3d_view &y_packed_,
                                                                                     const Const<impl_scalar_type_2d_view_tpetra> &b_,
                                                                                     const impl_scalar_type_2d_view_tpetra &x_,
                                                                                     const impl_scalar_type_2d_view_tpetra &x_remote_,
                                                                                     const bool compute_owned) {
  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<OverlapTag>", ComputeResidual0, execution_space);

  using Functor = ComputeResidualFunctor<typename impl_type::matrix_type>;
  Functor functor(*this);
  functor.b = b_;
  functor.x = x_;
  functor.x_remote = x_remote_;
  if constexpr (is_device<execution_space>::value) {
    functor.y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
                                                       y_packed_.extent(0),
                                                       y_packed_.extent(1),
                                                       y_packed_.extent(2),
                                                       impl_type::vector_length);
  } else {
    functor.y_packed = y_packed_;
  }

  if constexpr (is_device<execution_space>::value) {
    const local_ordinal_type blocksize = blocksize_requested;
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                                                          \
  if (this->hasBlockCrsMatrix) {                                                                                                \
    const local_ordinal_type team_size = 8;                                                                                     \
    const local_ordinal_type vector_size = 8;                                                                                   \
    const size_t shmem_team_size = blocksize * sizeof(btdm_scalar_type);                                                        \
    const size_t shmem_thread_size = blocksize * sizeof(btdm_scalar_type);                                                      \
    if (compute_owned) {                                                                                                        \
      Kokkos::TeamPolicy<execution_space, typename Functor::template OverlapTag<0, B, true>>                                    \
          policy(rowidx2part.extent(0), team_size, vector_size);                                                                \
      policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size));                       \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, functor);                                \
    } else {                                                                                                                    \
      Kokkos::TeamPolicy<execution_space, typename Functor::template OverlapTag<1, B, true>>                                    \
          policy(rowidx2part.extent(0), team_size, vector_size);                                                                \
      policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size));                       \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, functor);                                \
    }                                                                                                                           \
  } else {                                                                                                                      \
    const local_ordinal_type team_size = 8;                                                                                     \
    const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);   \
    if (compute_owned) {                                                                                                        \
      const Kokkos::TeamPolicy<execution_space, typename Functor::template OverlapTag<0, B, false>>                             \
          policy(rowidx2part.extent(0), team_size, vector_size);                                                                \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, functor);                                \
    } else {                                                                                                                    \
      const Kokkos::TeamPolicy<execution_space, typename Functor::template OverlapTag<1, B, false>>                             \
          policy(rowidx2part.extent(0), team_size, vector_size);                                                                \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, functor);                                \
    }                                                                                                                           \
  }                                                                                                                             \
  break
    switch (blocksize_requested) {
      case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
      case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
      case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
      case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
      case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
  } else {
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                               \
  if (this->hasBlockCrsMatrix) {                                                                     \
    if (compute_owned) {                                                                             \
      const Kokkos::RangePolicy<execution_space, typename Functor::template OverlapTag<0, B, true>>  \
          policy(0, rowidx2part.extent(0));                                                          \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, functor);    \
    } else {                                                                                         \
      const Kokkos::RangePolicy<execution_space, typename Functor::template OverlapTag<1, B, true>>  \
          policy(0, rowidx2part.extent(0));                                                          \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, functor);    \
    }                                                                                                \
  } else {                                                                                           \
    if (compute_owned) {                                                                             \
      const Kokkos::RangePolicy<execution_space, typename Functor::template OverlapTag<0, B, false>> \
          policy(0, rowidx2part.extent(0));                                                          \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, functor);    \
    } else {                                                                                         \
      const Kokkos::RangePolicy<execution_space, typename Functor::template OverlapTag<1, B, false>> \
          policy(0, rowidx2part.extent(0));                                                          \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, functor);    \
    }                                                                                                \
  }                                                                                                  \
  break
    switch (blocksize_requested) {
      case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
      case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
      case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
      case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
      case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
  }
  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
}

}  // namespace Ifpack2::BlockHelperDetails

#define IFPACK2_BLOCKCOMPUTERESIDUALVECTOR_INSTANT(S, LO, GO, N) \
  template class Ifpack2::BlockHelperDetails::ComputeResidualVector<Tpetra::RowMatrix<S, LO, GO, N>>;

#endif