// @HEADER
// *****************************************************************************
// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
//
// Copyright 2009 NTESS and the Ifpack2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef IFPACK2_BLOCKCOMPUTERES_AND_SOLVE_DEF_HPP
#define IFPACK2_BLOCKCOMPUTERES_AND_SOLVE_DEF_HPP

#include "Ifpack2_BlockComputeResidualAndSolve_decl.hpp"

namespace Ifpack2::BlockHelperDetails {
17template <typename MatrixType, int B>
18struct ComputeResidualAndSolve_SinglePass_Impl {
19 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
20 using node_device_type = typename impl_type::node_device_type;
21 using execution_space = typename impl_type::execution_space;
22 using memory_space = typename impl_type::memory_space;
23
24 using local_ordinal_type = typename impl_type::local_ordinal_type;
25 using size_type = typename impl_type::size_type;
26 using impl_scalar_type = typename impl_type::impl_scalar_type;
27 using magnitude_type = typename impl_type::magnitude_type;
29 using local_ordinal_type_1d_view =
30 typename impl_type::local_ordinal_type_1d_view;
31 using size_type_1d_view = typename impl_type::size_type_1d_view;
32 using tpetra_block_access_view_type =
33 typename impl_type::tpetra_block_access_view_type; // block crs (layout
34 // right)
35 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
36 using impl_scalar_type_2d_view_tpetra =
37 typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector
38 // (layout left)
39 using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
40 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
41 using i64_3d_view = typename impl_type::i64_3d_view;
42
44 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
45
46 // enum for max blocksize and vector length
47 enum : int { max_blocksize = 32 };
48
49 private:
50 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
51 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
52 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
53 Unmanaged<impl_scalar_type_2d_view_tpetra> y;
54
55 // AmD information
56 const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
57
58 // blocksize
59 const local_ordinal_type blocksize_requested;
60
61 // block offsets
62 const ConstUnmanaged<i64_3d_view> A_x_offsets;
63 const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
64
65 // diagonal block inverses
66 const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;
67
68 // squared update norms
69 const Unmanaged<impl_scalar_type_1d_view> W;
70
71 impl_scalar_type damping_factor;
72
73 public:
74 ComputeResidualAndSolve_SinglePass_Impl(const AmD<MatrixType>& amd,
75 const btdm_scalar_type_3d_view& d_inv_,
76 const impl_scalar_type_1d_view& W_,
77 const local_ordinal_type& blocksize_requested_,
78 const impl_scalar_type& damping_factor_)
79 : tpetra_values(amd.tpetra_values)
80 , blocksize_requested(blocksize_requested_)
81 , A_x_offsets(amd.A_x_offsets)
82 , A_x_offsets_remote(amd.A_x_offsets_remote)
83 , d_inv(d_inv_)
84 , W(W_)
85 , damping_factor(damping_factor_) {}
86
87 KOKKOS_INLINE_FUNCTION
88 void operator()(const member_type& member) const {
89 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
90 const local_ordinal_type rowidx = member.league_rank();
91 const local_ordinal_type row = rowidx * blocksize;
92 const local_ordinal_type num_vectors = b.extent(1);
93 const local_ordinal_type num_local_rows = d_inv.extent(0);
94
95 const impl_scalar_type* xx;
96 auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
97 tpetra_values.data(), blocksize, blocksize);
98
99 // Get shared allocation for a local copy of x, residual, and A
100 impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
101 member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
102 impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
103 member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
104 impl_scalar_type* local_x =
105 reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
106 blocksize * sizeof(impl_scalar_type)));
107
108 magnitude_type norm = 0;
109 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
110 if (col) member.team_barrier();
111 // y -= Rx
112 // Initialize accumulation arrays
113 Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
114 [&](const local_ordinal_type& i) {
115 local_Dinv_residual[i] = 0;
116 local_residual[i] = b(row + i, col);
117 });
118 member.team_barrier();
119
120 int numEntries = A_x_offsets.extent(2);
121
122 Kokkos::parallel_for(
123 Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
124 int64_t A_offset = A_x_offsets(rowidx, 0, k);
125 int64_t x_offset = A_x_offsets(rowidx, 1, k);
126 if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
127 A_block_cst.assign_data(tpetra_values.data() + A_offset);
128 // Pull x into local memory
129 int64_t remote_cutoff = blocksize * num_local_rows;
130 if (x_offset >= remote_cutoff)
131 xx = &x_remote(x_offset - remote_cutoff, col);
132 else
133 xx = &x(x_offset, col);
134
135 Kokkos::parallel_for(
136 Kokkos::ThreadVectorRange(member, blocksize),
137 [&](const local_ordinal_type& i) { local_x[i] = xx[i]; });
138
139 // matvec on block: local_residual -= A_block_cst * local_x
140 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
141 [&](const int k0) {
142 impl_scalar_type val = 0;
143 for (int k1 = 0; k1 < blocksize; k1++)
144 val += A_block_cst(k0, k1) * local_x[k1];
145 Kokkos::atomic_add(local_residual + k0, -val);
146 });
147 }
148 });
149 member.team_barrier();
150 // Compute local_Dinv_residual = D^-1 * local_residual
151 Kokkos::parallel_for(
152 Kokkos::TeamThreadRange(member, blocksize),
153 [&](const local_ordinal_type& k0) {
154 Kokkos::parallel_reduce(
155 Kokkos::ThreadVectorRange(member, blocksize),
156 [&](const local_ordinal_type& k1, impl_scalar_type& update) {
157 update += d_inv(rowidx, k0, k1) * local_residual[k1];
158 },
159 local_Dinv_residual[k0]);
160 });
161 member.team_barrier();
162 // local_Dinv_residual is fully computed. Now compute the
163 // squared y update norm and update y (using damping factor).
164 magnitude_type colNorm;
165 Kokkos::parallel_reduce(
166 Kokkos::TeamVectorRange(member, blocksize),
167 [&](const local_ordinal_type& k, magnitude_type& update) {
168 // Compute the change in y (assuming damping_factor == 1) for this
169 // entry.
170 impl_scalar_type old_y = x(row + k, col);
171 impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
172 if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
173 magnitude_type ydiff =
174 KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
175 update += ydiff * ydiff;
176 } else {
177 update += y_update * y_update;
178 }
179 y(row + k, col) = old_y + damping_factor * y_update;
180 },
181 colNorm);
182 norm += colNorm;
183 }
184 Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
185 }
186
187 void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
188 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
189 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
190 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
191 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
192 IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
193 "BlockTriDi::ComputeResidualAndSolve::RunSinglePass",
194 ComputeResidualAndSolve0, execution_space);
195
196 y = y_;
197 b = b_;
198 x = x_;
199 x_remote = x_remote_;
200
201 const local_ordinal_type blocksize = blocksize_requested;
202 const local_ordinal_type nrows = d_inv.extent(0);
203
204 const local_ordinal_type team_size = 8;
205 const local_ordinal_type vector_size = 8;
206 // team: local_residual, local_Dinv_residual
207 const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
208 // thread: local_x
209 const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
210 Kokkos::TeamPolicy<execution_space> policy(nrows, team_size, vector_size);
211 policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
212 Kokkos::PerThread(shmem_thread_size));
213 Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::SinglePass",
214 policy, *this);
215
216 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
217 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
218 }
219};
220
221template <typename MatrixType, int B>
222struct ComputeResidualAndSolve_2Pass_Impl {
223 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
224 using node_device_type = typename impl_type::node_device_type;
225 using execution_space = typename impl_type::execution_space;
226 using memory_space = typename impl_type::memory_space;
227
228 using local_ordinal_type = typename impl_type::local_ordinal_type;
229 using size_type = typename impl_type::size_type;
230 using impl_scalar_type = typename impl_type::impl_scalar_type;
231 using magnitude_type = typename impl_type::magnitude_type;
233 using local_ordinal_type_1d_view =
234 typename impl_type::local_ordinal_type_1d_view;
235 using size_type_1d_view = typename impl_type::size_type_1d_view;
236 using tpetra_block_access_view_type =
237 typename impl_type::tpetra_block_access_view_type; // block crs (layout
238 // right)
239 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
240 using impl_scalar_type_2d_view_tpetra =
241 typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector
242 // (layout left)
243 using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
244 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
245 using i64_3d_view = typename impl_type::i64_3d_view;
246
248 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
249
250 // enum for max blocksize and vector length
251 enum : int { max_blocksize = 32 };
252
253 // Tag for computing residual with owned columns only (pass 1)
254 struct OwnedTag {};
255
256 // Tag for finishing the residual with nonowned columns, and solving/norming
257 // (pass 2)
258 struct NonownedTag {};
259
260 private:
261 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
262 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
263 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
264 Unmanaged<impl_scalar_type_2d_view_tpetra> y;
265
266 // AmD information
267 const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
268
269 // blocksize
270 const local_ordinal_type blocksize_requested;
271
272 // block offsets
273 const ConstUnmanaged<i64_3d_view> A_x_offsets;
274 const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
275
276 // diagonal block inverses
277 const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;
278
279 // squared update norms
280 const Unmanaged<impl_scalar_type_1d_view> W;
281
282 impl_scalar_type damping_factor;
283
284 public:
285 ComputeResidualAndSolve_2Pass_Impl(
286 const AmD<MatrixType>& amd,
287 const btdm_scalar_type_3d_view& d_inv_,
288 const impl_scalar_type_1d_view& W_,
289 const local_ordinal_type& blocksize_requested_,
290 const impl_scalar_type& damping_factor_)
291 : tpetra_values(amd.tpetra_values)
292 , blocksize_requested(blocksize_requested_)
293 , A_x_offsets(amd.A_x_offsets)
294 , A_x_offsets_remote(amd.A_x_offsets_remote)
295 , d_inv(d_inv_)
296 , W(W_)
297 , damping_factor(damping_factor_) {}
298
299 KOKKOS_INLINE_FUNCTION
300 void operator()(const OwnedTag, const member_type& member) const {
301 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
302 const local_ordinal_type rowidx = member.league_rank();
303 const local_ordinal_type row = rowidx * blocksize;
304 const local_ordinal_type num_vectors = b.extent(1);
305
306 auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
307 tpetra_values.data(), blocksize, blocksize);
308
309 // Get shared allocation for a local copy of x, Ax, and A
310 impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
311 member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
312 impl_scalar_type* local_x =
313 reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
314 blocksize * sizeof(impl_scalar_type)));
315
316 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
317 if (col) member.team_barrier();
318 // y -= Rx
319 // Initialize accumulation arrays
320 Kokkos::parallel_for(
321 Kokkos::TeamVectorRange(member, blocksize),
322 [&](const local_ordinal_type& i) { local_residual[i] = b(row + i, col); });
323 member.team_barrier();
324
325 int numEntries = A_x_offsets.extent(2);
326
327 Kokkos::parallel_for(
328 Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
329 int64_t A_offset = A_x_offsets(rowidx, 0, k);
330 int64_t x_offset = A_x_offsets(rowidx, 1, k);
331 if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
332 A_block_cst.assign_data(tpetra_values.data() + A_offset);
333 // Pull x into local memory
334 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
335 [&](const local_ordinal_type& i) {
336 local_x[i] = x(x_offset + i, col);
337 });
338
339 // MatVec op Ax += A*x
340 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
341 [&](const local_ordinal_type& k0) {
342 impl_scalar_type val = 0;
343 for (int k1 = 0; k1 < blocksize; k1++)
344 val += A_block_cst(k0, k1) * local_x[k1];
345 Kokkos::atomic_add(local_residual + k0, -val);
346 });
347 }
348 });
349 member.team_barrier();
350 // Write back the partial residual to y
351 if (member.team_rank() == 0) {
352 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
353 [&](const local_ordinal_type& k) {
354 y(row + k, col) = local_residual[k];
355 });
356 }
357 }
358 }
359
360 KOKKOS_INLINE_FUNCTION
361 void operator()(const NonownedTag, const member_type& member) const {
362 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
363 const local_ordinal_type rowidx = member.league_rank();
364 const local_ordinal_type row = rowidx * blocksize;
365 const local_ordinal_type num_vectors = y.extent(1);
366
367 auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
368 tpetra_values.data(), blocksize, blocksize);
369
370 // Get shared allocation for a local copy of x, Ax, and A
371 impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
372 member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
373 impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
374 member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
375 impl_scalar_type* local_x =
376 reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
377 blocksize * sizeof(impl_scalar_type)));
378
379 magnitude_type norm = 0;
380 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
381 if (col) member.team_barrier();
382 // y -= Rx
383 // Initialize accumulation arrays.
384 Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
385 [&](const local_ordinal_type& i) {
386 local_Dinv_residual[i] = 0;
387 local_residual[i] = y(row + i, col);
388 });
389 member.team_barrier();
390
391 int numEntries = A_x_offsets_remote.extent(2);
392
393 Kokkos::parallel_for(
394 Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
395 int64_t A_offset = A_x_offsets_remote(rowidx, 0, k);
396 int64_t x_offset = A_x_offsets_remote(rowidx, 1, k);
397 if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
398 A_block_cst.assign_data(tpetra_values.data() + A_offset);
399 // Pull x into local memory
400 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
401 [&](const local_ordinal_type& i) {
402 local_x[i] = x_remote(x_offset + i, col);
403 });
404
405 // matvec on block: local_residual -= A_block_cst * local_x
406 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
407 [&](const int k0) {
408 impl_scalar_type val = 0;
409 for (int k1 = 0; k1 < blocksize; k1++)
410 val += A_block_cst(k0, k1) * local_x[k1];
411 Kokkos::atomic_add(local_residual + k0, -val);
412 });
413 }
414 });
415 member.team_barrier();
416 // Compute local_Dinv_residual = D^-1 * local_residual
417 Kokkos::parallel_for(
418 Kokkos::TeamThreadRange(member, blocksize),
419 [&](const local_ordinal_type& k0) {
420 Kokkos::parallel_reduce(
421 Kokkos::ThreadVectorRange(member, blocksize),
422 [&](const local_ordinal_type& k1, impl_scalar_type& update) {
423 update += d_inv(rowidx, k0, k1) * local_residual[k1];
424 },
425 local_Dinv_residual[k0]);
426 });
427 member.team_barrier();
428 // local_Dinv_residual is fully computed. Now compute the
429 // squared y update norm and update y (using damping factor).
430 magnitude_type colNorm;
431 Kokkos::parallel_reduce(
432 Kokkos::TeamVectorRange(member, blocksize),
433 [&](const local_ordinal_type& k, magnitude_type& update) {
434 // Compute the change in y (assuming damping_factor == 1) for this
435 // entry.
436 impl_scalar_type old_y = x(row + k, col);
437 impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
438 if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
439 magnitude_type ydiff =
440 KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
441 update += ydiff * ydiff;
442 } else {
443 update += y_update * y_update;
444 }
445 y(row + k, col) = old_y + damping_factor * y_update;
446 },
447 colNorm);
448 norm += colNorm;
449 }
450 Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
451 }
452
453 // Launch pass 1 of the 2-pass version.
454 // This computes just the owned part of residual and writes that back to y.
455 void run_pass1(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
456 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
457 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
458 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
459 IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
460 "BlockTriDi::ComputeResidualAndSolve::RunPass1",
461 ComputeResidualAndSolve0, execution_space);
462
463 b = b_;
464 x = x_;
465 y = y_;
466
467 const local_ordinal_type blocksize = blocksize_requested;
468 const local_ordinal_type nrows = d_inv.extent(0);
469
470 const local_ordinal_type team_size = 8;
471 const local_ordinal_type vector_size = 8;
472 const size_t shmem_team_size = blocksize * sizeof(impl_scalar_type);
473 const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
474 Kokkos::TeamPolicy<execution_space, OwnedTag> policy(nrows, team_size,
475 vector_size);
476 policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
477 Kokkos::PerThread(shmem_thread_size));
478 Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass1", policy,
479 *this);
480 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
481 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
482 }
483
484 // Launch pass 2 of the 2-pass version.
485 // This finishes computing residual with x_remote,
486 // and then applies Dinv and computes norm.
487 void run_pass2(
488 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
489 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
490 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
491 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
492 IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
493 "BlockTriDi::ComputeResidualAndSolve::RunPass2",
494 ComputeResidualAndSolve0, execution_space);
495
496 x = x_;
497 x_remote = x_remote_;
498 y = y_;
499
500 const local_ordinal_type blocksize = blocksize_requested;
501 const local_ordinal_type nrows = d_inv.extent(0);
502
503 const local_ordinal_type team_size = 8;
504 const local_ordinal_type vector_size = 8;
505 const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
506 const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
507 Kokkos::TeamPolicy<execution_space, NonownedTag> policy(nrows, team_size,
508 vector_size);
509 policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
510 Kokkos::PerThread(shmem_thread_size));
511 Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass2", policy,
512 *this);
513 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
514 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
515 }
516};
517
518template <typename MatrixType, int B>
519struct ComputeResidualAndSolve_YZero_Impl {
520 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
521 using node_device_type = typename impl_type::node_device_type;
522 using execution_space = typename impl_type::execution_space;
523 using memory_space = typename impl_type::memory_space;
524
525 using local_ordinal_type = typename impl_type::local_ordinal_type;
526 using size_type = typename impl_type::size_type;
527 using impl_scalar_type = typename impl_type::impl_scalar_type;
528 using magnitude_type = typename impl_type::magnitude_type;
530 using local_ordinal_type_1d_view =
531 typename impl_type::local_ordinal_type_1d_view;
532 using size_type_1d_view = typename impl_type::size_type_1d_view;
533 using tpetra_block_access_view_type =
534 typename impl_type::tpetra_block_access_view_type; // block crs (layout
535 // right)
536 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
537 using impl_scalar_type_2d_view_tpetra =
538 typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector
539 // (layout left)
540 using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
541 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
542 using i64_3d_view = typename impl_type::i64_3d_view;
543
545 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
546
547 private:
548 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
549 Unmanaged<impl_scalar_type_2d_view_tpetra> y;
550
551 // AmD information
552 const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
553
554 // blocksize
555 const local_ordinal_type blocksize_requested;
556
557 // block offsets
558 const ConstUnmanaged<i64_3d_view> A_x_offsets;
559 const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
560
561 // diagonal block inverses
562 const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;
563
564 // squared update norms
565 const Unmanaged<impl_scalar_type_1d_view> W;
566
567 impl_scalar_type damping_factor;
568
569 public:
570 ComputeResidualAndSolve_YZero_Impl(
571 const AmD<MatrixType>& amd, const btdm_scalar_type_3d_view& d_inv_,
572 const impl_scalar_type_1d_view& W_,
573 const local_ordinal_type& blocksize_requested_,
574 const impl_scalar_type& damping_factor_)
575 : tpetra_values(amd.tpetra_values)
576 , blocksize_requested(blocksize_requested_)
577 , A_x_offsets(amd.A_x_offsets)
578 , A_x_offsets_remote(amd.A_x_offsets_remote)
579 , d_inv(d_inv_)
580 , W(W_)
581 , damping_factor(damping_factor_) {}
582
583 KOKKOS_INLINE_FUNCTION
584 void operator()(const member_type& member) const {
585 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
586 const local_ordinal_type rowidx =
587 member.league_rank() * member.team_size() + member.team_rank();
588 const local_ordinal_type row = rowidx * blocksize;
589 const local_ordinal_type num_vectors = b.extent(1);
590
591 // Get shared allocation for a local copy of x, Ax, and A
592 impl_scalar_type* local_Dinv_residual =
593 reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
594 blocksize * sizeof(impl_scalar_type)));
595
596 if (rowidx >= (local_ordinal_type)d_inv.extent(0)) return;
597
598 magnitude_type norm = 0;
599 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
600 // Compute local_Dinv_residual = D^-1 * local_residual
601 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
602 [&](const local_ordinal_type& k0) {
603 impl_scalar_type val = 0;
604 for (local_ordinal_type k1 = 0; k1 < blocksize;
605 k1++) {
606 val += d_inv(rowidx, k0, k1) * b(row + k1, col);
607 }
608 local_Dinv_residual[k0] = val;
609 });
610
611 magnitude_type colNorm;
612 Kokkos::parallel_reduce(
613 Kokkos::ThreadVectorRange(member, blocksize),
614 [&](const local_ordinal_type& k, magnitude_type& update) {
615 // Compute the change in y (assuming damping_factor == 1) for this
616 // entry.
617 impl_scalar_type y_update = local_Dinv_residual[k];
618 if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
619 magnitude_type ydiff =
620 KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
621 update += ydiff * ydiff;
622 } else {
623 update += y_update * y_update;
624 }
625 y(row + k, col) = damping_factor * y_update;
626 },
627 colNorm);
628 norm += colNorm;
629 }
630 Kokkos::single(Kokkos::PerThread(member), [&]() { W(rowidx) = norm; });
631 }
632
633 // ComputeResidualAndSolve_SolveOnly::run does the solve for the first
634 // iteration, when the initial guess for y is zero. This means the residual
635 // vector is just b. The kernel applies the inverse diags to b to find y, and
636 // also puts the partial squared update norms (1 per row) into W.
637 void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
638 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
639 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
640 IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
641 "BlockTriDi::ComputeResidualAndSolve::Run_Y_Zero",
642 ComputeResidualAndSolve0, execution_space);
643
644 this->y = y_;
645 this->b = b_;
646
647 const local_ordinal_type blocksize = blocksize_requested;
648 const local_ordinal_type nrows = d_inv.extent(0);
649
650 const local_ordinal_type team_size = 8;
651 const local_ordinal_type vector_size = 8;
652 const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
653 Kokkos::TeamPolicy<execution_space> policy(
654 (nrows + team_size - 1) / team_size, team_size, vector_size);
655 policy.set_scratch_size(0, Kokkos::PerThread(shmem_thread_size));
656 Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::y_zero", policy, *this);
657 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
658 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
659 }
660};
661
662// run_y_zero does the solve for the first
663// iteration, when the initial guess for y is zero. This means the residual
664// vector is just b. The kernel applies the inverse diags to b to find y, and
665// also puts the partial squared update norms (1 per row) into W.
666template <typename MatrixType>
667void ComputeResidualAndSolve<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::run_y_zero(
668 const Const<impl_scalar_type_2d_view_tpetra>& b_,
669 const impl_scalar_type_2d_view_tpetra& y_) {
670#define RUN_CASE(B) \
671 { \
672 ComputeResidualAndSolve_YZero_Impl<MatrixType, B> functor(amd, d_inv, W, blocksize_requested, damping_factor); \
673 functor.run(b_, y_); \
674 break; \
675 }
676
677 switch (blocksize_requested) {
678 case 3: RUN_CASE(3);
679 case 5: RUN_CASE(5);
680 case 7: RUN_CASE(7);
681 case 9: RUN_CASE(9);
682 case 10: RUN_CASE(10);
683 case 11: RUN_CASE(11);
684 case 16: RUN_CASE(16);
685 case 17: RUN_CASE(17);
686 case 18: RUN_CASE(18);
687 default: RUN_CASE(0);
688 }
689#undef RUN_CASE
690}
691
692template <typename MatrixType>
693void ComputeResidualAndSolve<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::run_single_pass(
694 const Const<impl_scalar_type_2d_view_tpetra>& b_,
695 const impl_scalar_type_2d_view_tpetra& x_,
696 const impl_scalar_type_2d_view_tpetra& x_remote_,
697 const impl_scalar_type_2d_view_tpetra& y_) {
698#define RUN_CASE(B) \
699 { \
700 ComputeResidualAndSolve_SinglePass_Impl<MatrixType, B> functor(amd, d_inv, W, blocksize_requested, damping_factor); \
701 functor.run(b_, x_, x_remote_, y_); \
702 break; \
703 }
704
705 switch (blocksize_requested) {
706 case 3: RUN_CASE(3);
707 case 5: RUN_CASE(5);
708 case 7: RUN_CASE(7);
709 case 9: RUN_CASE(9);
710 case 10: RUN_CASE(10);
711 case 11: RUN_CASE(11);
712 case 16: RUN_CASE(16);
713 case 17: RUN_CASE(17);
714 case 18: RUN_CASE(18);
715 default: RUN_CASE(0);
716 }
717#undef RUN_CASE
718}
719
720template <typename MatrixType>
721void ComputeResidualAndSolve<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::run_pass1_of_2(
722 const Const<impl_scalar_type_2d_view_tpetra>& b_,
723 const impl_scalar_type_2d_view_tpetra& x_,
724 const impl_scalar_type_2d_view_tpetra& y_) {
725#define RUN_CASE(B) \
726 { \
727 ComputeResidualAndSolve_2Pass_Impl<MatrixType, B> functor(amd, d_inv, W, blocksize_requested, damping_factor); \
728 functor.run_pass1(b_, x_, y_); \
729 break; \
730 }
731
732 switch (blocksize_requested) {
733 case 3: RUN_CASE(3);
734 case 5: RUN_CASE(5);
735 case 7: RUN_CASE(7);
736 case 9: RUN_CASE(9);
737 case 10: RUN_CASE(10);
738 case 11: RUN_CASE(11);
739 case 16: RUN_CASE(16);
740 case 17: RUN_CASE(17);
741 case 18: RUN_CASE(18);
742 default: RUN_CASE(0);
743 }
744#undef RUN_CASE
745}
746
747template <typename MatrixType>
748void ComputeResidualAndSolve<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>::run_pass2_of_2(
749 const impl_scalar_type_2d_view_tpetra& x_,
750 const impl_scalar_type_2d_view_tpetra& x_remote_,
751 const impl_scalar_type_2d_view_tpetra& y_) {
752#define RUN_CASE(B) \
753 { \
754 ComputeResidualAndSolve_2Pass_Impl<MatrixType, B> functor(amd, d_inv, W, blocksize_requested, damping_factor); \
755 functor.run_pass2(x_, x_remote_, y_); \
756 break; \
757 }
758
759 switch (blocksize_requested) {
760 case 3: RUN_CASE(3);
761 case 5: RUN_CASE(5);
762 case 7: RUN_CASE(7);
763 case 9: RUN_CASE(9);
764 case 10: RUN_CASE(10);
765 case 11: RUN_CASE(11);
766 case 16: RUN_CASE(16);
767 case 17: RUN_CASE(17);
768 case 18: RUN_CASE(18);
769 default: RUN_CASE(0);
770 }
771#undef RUN_CASE
772}
773
}  // namespace Ifpack2::BlockHelperDetails

#define IFPACK2_BLOCKCOMPUTERESIDUALANDSOLVE_INSTANT(S, LO, GO, N) \
  template class Ifpack2::BlockHelperDetails::ComputeResidualAndSolve<Tpetra::RowMatrix<S, LO, GO, N> >;

#endif