#ifndef IFPACK2_BLOCKCOMPUTERES_AND_SOLVE_DEF_HPP
#define IFPACK2_BLOCKCOMPUTERES_AND_SOLVE_DEF_HPP

#include "Ifpack2_BlockComputeResidualAndSolve_decl.hpp"

namespace Ifpack2::BlockHelperDetails {
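
// The functors below fuse the computation of the block-sparse residual with
// one damped block-Jacobi sweep. d_inv holds the explicit inverses of the
// diagonal blocks; each functor applies the damped update to y and
// accumulates the squared norm of the per-row update into W, which the
// caller can reduce for a convergence check.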
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_SinglePass_Impl {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using memory_space    = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type   = typename impl_type::impl_scalar_type;
  using magnitude_type     = typename impl_type::magnitude_type;

  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type;

  using impl_scalar_type_1d_view        = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view              = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };
  // Input and output multivectors (x_remote holds the halo/ghost rows).
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // Raw block values of A.
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // Block size requested at runtime (used when B == 0).
  const local_ordinal_type blocksize_requested;

  // Offsets of each A block and the matching x (or x_remote) block column.
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // Inverses of the diagonal blocks.
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // Squared norm of the update, one entry per block row.
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

  ComputeResidualAndSolve_SinglePass_Impl(const AmD<MatrixType>& amd,
                                          const btdm_scalar_type_3d_view& d_inv_,
                                          const impl_scalar_type_1d_view& W_,
                                          const local_ordinal_type& blocksize_requested_,
                                          const impl_scalar_type& damping_factor_)
      : tpetra_values(amd.tpetra_values)
      , blocksize_requested(blocksize_requested_)
      , A_x_offsets(amd.A_x_offsets)
      , A_x_offsets_remote(amd.A_x_offsets_remote)
      , d_inv(d_inv_)
      , W(W_)
      , damping_factor(damping_factor_) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize      = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx         = member.league_rank();
    const local_ordinal_type row            = rowidx * blocksize;
    const local_ordinal_type num_vectors    = b.extent(1);
    const local_ordinal_type num_local_rows = d_inv.extent(0);

    const impl_scalar_type* xx;
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        tpetra_values.data(), blocksize, blocksize);

    // Scratch: residual and D^{-1}*residual are shared by the team; each
    // thread keeps its own copy of the current x block column.
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x = reinterpret_cast<impl_scalar_type*>(
        member.thread_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // Initialize the residual to b and D^{-1}*residual to zero.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i]      = b(row + i, col);
                           });
      member.team_barrier();
      int numEntries = A_x_offsets.extent(2);

      // residual -= A_k * x_k over the blocks in this block row.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
            if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // x offsets at or past the cutoff address the remote (halo) rows.
              int64_t remote_cutoff = blocksize * num_local_rows;
              if (x_offset >= remote_cutoff)
                xx = &x_remote(x_offset - remote_cutoff, col);
              else
                xx = &x(x_offset, col);
              // Stage the x block column in thread-local scratch.
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) { local_x[i] = xx[i]; });
              // Multiply the block and subtract from the shared residual
              // (atomically: multiple threads may update the same entries).
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // local_Dinv_residual = D^{-1} * local_residual.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();
      // Compute the damped update and accumulate its squared norm.
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            impl_scalar_type old_y    = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
            if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }
  // Launch the fused kernel. This variant assumes x_remote is already up to
  // date, so owned and remote contributions are handled in a single pass.
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunSinglePass",
        ComputeResidualAndSolve0, execution_space);

    b        = b_;
    x        = x_;
    x_remote = x_remote_;
    y        = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    // Team scratch holds local_residual and local_Dinv_residual; thread
    // scratch holds local_x.
    const size_t shmem_team_size   = 2 * blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(nrows, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::SinglePass",
                         policy, *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
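
// Two-pass variant: pass 1 (OwnedTag) accumulates b - A*x over the locally
// owned columns only and writes the partial residual into y; pass 2
// (NonownedTag) finishes the residual with the remote (halo) columns, then
// applies d_inv and the damped update. Since pass 1 never reads x_remote,
// the caller can overlap the halo exchange with pass 1.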
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_2Pass_Impl {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using memory_space    = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type   = typename impl_type::impl_scalar_type;
  using magnitude_type     = typename impl_type::magnitude_type;

  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type;

  using impl_scalar_type_1d_view        = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view              = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };
  // Dispatch tags for the two passes.
  struct OwnedTag {};
  struct NonownedTag {};

  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // Raw block values of A.
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // Block size requested at runtime (used when B == 0).
  const local_ordinal_type blocksize_requested;

  // Offsets of each A block and the matching x (or x_remote) block column.
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // Inverses of the diagonal blocks.
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // Squared norm of the update, one entry per block row.
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

  ComputeResidualAndSolve_2Pass_Impl(const AmD<MatrixType>& amd,
                                     const btdm_scalar_type_3d_view& d_inv_,
                                     const impl_scalar_type_1d_view& W_,
                                     const local_ordinal_type& blocksize_requested_,
                                     const impl_scalar_type& damping_factor_)
      : tpetra_values(amd.tpetra_values)
      , blocksize_requested(blocksize_requested_)
      , A_x_offsets(amd.A_x_offsets)
      , A_x_offsets_remote(amd.A_x_offsets_remote)
      , d_inv(d_inv_)
      , W(W_)
      , damping_factor(damping_factor_) {}
  // Pass 1: partial residual y = b - A_owned * x (owned columns only).
  KOKKOS_INLINE_FUNCTION
  void operator()(const OwnedTag, const member_type& member) const {
    const local_ordinal_type blocksize   = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx      = member.league_rank();
    const local_ordinal_type row         = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        tpetra_values.data(), blocksize, blocksize);

    // Scratch: residual per team, x block column per thread.
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x = reinterpret_cast<impl_scalar_type*>(
        member.thread_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // Initialize the residual to b.
      Kokkos::parallel_for(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& i) { local_residual[i] = b(row + i, col); });
      member.team_barrier();
      int numEntries = A_x_offsets.extent(2);

      // residual -= A_k * x_k over the owned blocks in this block row.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
            if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Stage the x block column in thread-local scratch.
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x(x_offset + i, col);
                                   });
              // Multiply the block and subtract from the shared residual.
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();

      // Write the partial residual to y; pass 2 completes it.
      if (member.team_rank() == 0) {
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                             [&](const local_ordinal_type& k) {
                               y(row + k, col) = local_residual[k];
                             });
      }
    }
  }
  // Pass 2: finish the residual with the remote (halo) columns, then apply
  // d_inv and the damped update.
  KOKKOS_INLINE_FUNCTION
  void operator()(const NonownedTag, const member_type& member) const {
    const local_ordinal_type blocksize   = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx      = member.league_rank();
    const local_ordinal_type row         = rowidx * blocksize;
    const local_ordinal_type num_vectors = y.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        tpetra_values.data(), blocksize, blocksize);

    // Scratch: residual and D^{-1}*residual per team, x block column per thread.
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x = reinterpret_cast<impl_scalar_type*>(
        member.thread_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // Start from the partial residual written by pass 1.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i]      = y(row + i, col);
                           });
      member.team_barrier();
      int numEntries = A_x_offsets_remote.extent(2);

      // residual -= A_k * x_remote_k over the nonowned blocks.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets_remote(rowidx, 0, k);
            int64_t x_offset = A_x_offsets_remote(rowidx, 1, k);
            if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Stage the remote x block column in thread-local scratch.
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x_remote(x_offset + i, col);
                                   });
              // Multiply the block and subtract from the shared residual.
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // local_Dinv_residual = D^{-1} * local_residual.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();
      // Compute the damped update and accumulate its squared norm.
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            impl_scalar_type old_y    = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
            if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }
  // Launch pass 1 (owned columns only; does not touch x_remote).
  void run_pass1(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
                 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
                 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass1",
        ComputeResidualAndSolve0, execution_space);

    b = b_;
    x = x_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size   = blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, OwnedTag> policy(nrows, team_size,
                                                         vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass1", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
  // Launch pass 2 (remote columns, D^{-1} apply, and damped update).
  void run_pass2(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
                 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
                 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass2",
        ComputeResidualAndSolve0, execution_space);

    x        = x_;
    x_remote = x_remote_;
    y        = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size   = 2 * blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, NonownedTag> policy(nrows, team_size,
                                                            vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass2", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
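
// Special case for an initial guess of zero: the residual is simply b, so
// y = damping_factor * D^{-1} * b can be formed directly, with no gather of
// x and no atomics. Block rows are mapped one per thread rather than one
// per team.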
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_YZero_Impl {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using memory_space    = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type   = typename impl_type::impl_scalar_type;
  using magnitude_type     = typename impl_type::magnitude_type;

  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type;

  using impl_scalar_type_1d_view        = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view              = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // Raw block values of A.
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // Block size requested at runtime (used when B == 0).
  const local_ordinal_type blocksize_requested;

  // Offsets of each A block and the matching x (or x_remote) block column.
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // Inverses of the diagonal blocks.
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // Squared norm of the update, one entry per block row.
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

  ComputeResidualAndSolve_YZero_Impl(const AmD<MatrixType>& amd,
                                     const btdm_scalar_type_3d_view& d_inv_,
                                     const impl_scalar_type_1d_view& W_,
                                     const local_ordinal_type& blocksize_requested_,
                                     const impl_scalar_type& damping_factor_)
      : tpetra_values(amd.tpetra_values)
      , blocksize_requested(blocksize_requested_)
      , A_x_offsets(amd.A_x_offsets)
      , A_x_offsets_remote(amd.A_x_offsets_remote)
      , d_inv(d_inv_)
      , W(W_)
      , damping_factor(damping_factor_) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    // One block row per thread: flatten the league/team indices.
    const local_ordinal_type rowidx =
        member.league_rank() * member.team_size() + member.team_rank();
    const local_ordinal_type row         = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    // Scratch: one D^{-1}*b block column per thread.
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.thread_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));

    // The last team may extend past the end of the matrix.
    if (rowidx >= (local_ordinal_type)d_inv.extent(0)) return;

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // local_Dinv_residual = D^{-1} * b (the residual, since y starts at zero).
      Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                           [&](const local_ordinal_type& k0) {
                             impl_scalar_type val = 0;
                             for (local_ordinal_type k1 = 0; k1 < blocksize; k1++)
                               val += d_inv(rowidx, k0, k1) * b(row + k1, col);
                             local_Dinv_residual[k0] = val;
                           });
      // Compute the damped update and accumulate its squared norm.
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::ThreadVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            impl_scalar_type y_update = local_Dinv_residual[k];
            if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerThread(member), [&]() { W(rowidx) = norm; });
  }
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::Run_Y_Zero",
        ComputeResidualAndSolve0, execution_space);

    b = b_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    // One block row per thread, so round the league size up.
    Kokkos::TeamPolicy<execution_space> policy(
        (nrows + team_size - 1) / team_size, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::y_zero", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
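
// The run_* entry points below dispatch on the runtime block size. For the
// block sizes listed in each switch, the kernel is instantiated with B as a
// compile-time constant so the inner loop bounds are known to the compiler;
// RUN_CASE(0) is the generic fallback that reads the block size at runtime.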
template <typename MatrixType>
void ComputeResidualAndSolve<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::
    run_y_zero(const Const<impl_scalar_type_2d_view_tpetra>& b_,
               const impl_scalar_type_2d_view_tpetra& y_) {
#define RUN_CASE(B)                                                          \
  {                                                                          \
    ComputeResidualAndSolve_YZero_Impl<MatrixType, B> functor(               \
        amd, d_inv, W, blocksize_requested, damping_factor);                 \
    functor.run(b_, y_);                                                     \
    break;                                                                   \
  }
  switch (blocksize_requested) {
    case 10: RUN_CASE(10);
    case 11: RUN_CASE(11);
    case 16: RUN_CASE(16);
    case 17: RUN_CASE(17);
    case 18: RUN_CASE(18);
    default: RUN_CASE(0);
  }
#undef RUN_CASE
}
template <typename MatrixType>
void ComputeResidualAndSolve<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::
    run_single_pass(const Const<impl_scalar_type_2d_view_tpetra>& b_,
                    const impl_scalar_type_2d_view_tpetra& x_,
                    const impl_scalar_type_2d_view_tpetra& x_remote_,
                    const impl_scalar_type_2d_view_tpetra& y_) {
#define RUN_CASE(B)                                                          \
  {                                                                          \
    ComputeResidualAndSolve_SinglePass_Impl<MatrixType, B> functor(          \
        amd, d_inv, W, blocksize_requested, damping_factor);                 \
    functor.run(b_, x_, x_remote_, y_);                                      \
    break;                                                                   \
  }
  switch (blocksize_requested) {
    case 10: RUN_CASE(10);
    case 11: RUN_CASE(11);
    case 16: RUN_CASE(16);
    case 17: RUN_CASE(17);
    case 18: RUN_CASE(18);
    default: RUN_CASE(0);
  }
#undef RUN_CASE
}
template <typename MatrixType>
void ComputeResidualAndSolve<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::
    run_pass1_of_2(const Const<impl_scalar_type_2d_view_tpetra>& b_,
                   const impl_scalar_type_2d_view_tpetra& x_,
                   const impl_scalar_type_2d_view_tpetra& y_) {
#define RUN_CASE(B)                                                          \
  {                                                                          \
    ComputeResidualAndSolve_2Pass_Impl<MatrixType, B> functor(               \
        amd, d_inv, W, blocksize_requested, damping_factor);                 \
    functor.run_pass1(b_, x_, y_);                                           \
    break;                                                                   \
  }
  switch (blocksize_requested) {
    case 10: RUN_CASE(10);
    case 11: RUN_CASE(11);
    case 16: RUN_CASE(16);
    case 17: RUN_CASE(17);
    case 18: RUN_CASE(18);
    default: RUN_CASE(0);
  }
#undef RUN_CASE
}
template <typename MatrixType>
void ComputeResidualAndSolve<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::
    run_pass2_of_2(const impl_scalar_type_2d_view_tpetra& x_,
                   const impl_scalar_type_2d_view_tpetra& x_remote_,
                   const impl_scalar_type_2d_view_tpetra& y_) {
#define RUN_CASE(B)                                                          \
  {                                                                          \
    ComputeResidualAndSolve_2Pass_Impl<MatrixType, B> functor(               \
        amd, d_inv, W, blocksize_requested, damping_factor);                 \
    functor.run_pass2(x_, x_remote_, y_);                                    \
    break;                                                                   \
  }
  switch (blocksize_requested) {
    case 10: RUN_CASE(10);
    case 11: RUN_CASE(11);
    case 16: RUN_CASE(16);
    case 17: RUN_CASE(17);
    case 18: RUN_CASE(18);
    default: RUN_CASE(0);
  }
#undef RUN_CASE
}
}  // namespace Ifpack2::BlockHelperDetails

#define IFPACK2_BLOCKCOMPUTERESIDUALANDSOLVE_INSTANT(S, LO, GO, N) \
  template class Ifpack2::BlockHelperDetails::ComputeResidualAndSolve<Tpetra::RowMatrix<S, LO, GO, N> >;

#endif