Ifpack2 Templated Preconditioning Package Version 1.0
Ifpack2_BlockComputeResidualAndSolve.hpp
// @HEADER
// *****************************************************************************
// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
//
// Copyright 2009 NTESS and the Ifpack2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP
#define IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"

namespace Ifpack2::BlockHelperDetails {

template <typename ExecSpace, typename DiagOffsets, typename Rowptrs,
          typename Entries>
DiagOffsets findDiagOffsets(const Rowptrs& rowptrs, const Entries& entries,
                            size_t nrows, int blocksize) {
  DiagOffsets diag_offsets(do_not_initialize_tag("btdm.diag_offsets"), nrows);
  int err1 = 0;
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<ExecSpace>(0, nrows),
      KOKKOS_LAMBDA(size_t row, int& err2) {
        auto rowBegin = rowptrs(row);
        auto rowEnd = rowptrs(row + 1);
        for (size_t j = rowBegin; j < rowEnd; j++) {
          if (size_t(entries(j)) == row) {
            diag_offsets(row) = j * blocksize * blocksize;
            return;
          }
        }
        err2++;
      },
      err1);
  TEUCHOS_TEST_FOR_EXCEPT_MSG(
      err1, "Ifpack2 BTD: at least one row has no diagonal entry");
  return diag_offsets;
}
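// Worked example (illustrative only): with blocksize = 3, if the diagonal
// entry of block row 5 is stored at entry index j = 12 (entries(12) == 5),
// then diag_offsets(5) = 12 * 3 * 3 = 108, i.e. the offset of the first
// scalar of that row's diagonal block within the flat block CRS values array.
// Treating the block as LayoutRight matches the tpetra_block_access_view_type
// used by the functors below; a hypothetical caller could then view it as
//
//   ConstUnmanaged<tpetra_block_access_view_type> D_blk(
//       values.data() + diag_offsets(5), blocksize, blocksize);
//
// where `values` stands in for the block values array and is not defined here.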

template <typename MatrixType, int B>
struct ComputeResidualAndSolve_1Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;  // block crs (layout
                                                           // right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector
                                                             // (layout left)
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize and vector length
  enum : int { max_blocksize = 32 };

 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // AmD information
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // block offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // diagonal block inverses
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // squared update norms
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

 public:
  ComputeResidualAndSolve_1Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values)
    , blocksize_requested(blocksize_requested_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , d_inv(d_inv_)
    , W(W_)
    , damping_factor(damping_factor_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);
    const local_ordinal_type num_local_rows = d_inv.extent(0);

    const impl_scalar_type* xx;
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        tpetra_values.data(), blocksize, blocksize);

    // Get shared allocation for a local copy of x, residual, and A
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // y -= Rx
      // Initialize accumulation arrays
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i] = b(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
#if KOKKOS_VERSION >= 40799
            if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
#else
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
#endif
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull x into local memory
              int64_t remote_cutoff = blocksize * num_local_rows;
              if (x_offset >= remote_cutoff)
                xx = &x_remote(x_offset - remote_cutoff, col);
              else
                xx = &x(x_offset, col);

              Kokkos::parallel_for(
                  Kokkos::ThreadVectorRange(member, blocksize),
                  [&](const local_ordinal_type& i) { local_x[i] = xx[i]; });

              // matvec on block: local_residual -= A_block_cst * local_x
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const int k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // Compute local_Dinv_residual = D^-1 * local_residual
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();
      // local_Dinv_residual is fully computed. Now compute the
      // squared y update norm and update y (using damping factor).
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Compute the change in y (assuming damping_factor == 1) for this
            // entry.
            impl_scalar_type old_y = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
#if KOKKOS_VERSION >= 40799
            if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
#else
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
#endif
              magnitude_type ydiff =
#if KOKKOS_VERSION >= 40799
                  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
#else
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
#endif
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }
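  // In effect, for each block row i and each right-hand-side column, the
  // kernel above performs one damped block-Jacobi update:
  //
  //   r_i = b_i - sum_k A_ik * x_k           (A_x_offsets enumerates the A-minus-D blocks)
  //   y_i = x_i + omega * (D_ii^{-1} r_i - x_i),   omega = damping_factor
  //
  // and accumulates |D_ii^{-1} r_i - x_i|^2 over all columns into W(i).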

  // Launch SinglePass version (owned + nonowned residual, plus Dinv in a single
  // kernel)
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunSinglePass",
        ComputeResidualAndSolve0, execution_space);

    y = y_;
    b = b_;
    x = x_;
    x_remote = x_remote_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    // team: local_residual, local_Dinv_residual
    const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
    // thread: local_x
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(nrows, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::SinglePass",
                         policy, *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
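// A minimal usage sketch for the single-pass functor (hypothetical caller
// code; `amd`, `d_inv`, `W`, `bs`, `omega`, and the vector views are
// assumptions, not defined in this header):
//
//   ComputeResidualAndSolve_1Pass<MatrixType, 0> functor(amd, d_inv, W, bs, omega);
//   functor.run(b, x_owned, x_remote, y);   // y and W are overwritten
//
// Passing B == 0 uses the runtime blocksize; a positive B selects a
// compile-time fixed blocksize in operator().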

template <typename MatrixType, int B>
struct ComputeResidualAndSolve_2Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;  // block crs (layout
                                                           // right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector
                                                             // (layout left)
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize and vector length
  enum : int { max_blocksize = 32 };

  // Tag for computing residual with owned columns only (pass 1)
  struct OwnedTag {};

  // Tag for finishing the residual with nonowned columns, and solving/norming
  // (pass 2)
  struct NonownedTag {};

 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // AmD information
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // block offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // diagonal block inverses
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // squared update norms
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

 public:
  ComputeResidualAndSolve_2Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values)
    , blocksize_requested(blocksize_requested_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , d_inv(d_inv_)
    , W(W_)
    , damping_factor(damping_factor_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const OwnedTag, const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        tpetra_values.data(), blocksize, blocksize);

    // Get shared allocation for a local copy of x, Ax, and A
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // y -= Rx
      // Initialize accumulation arrays
      Kokkos::parallel_for(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& i) { local_residual[i] = b(row + i, col); });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
#if KOKKOS_VERSION >= 40799
            if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
#else
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
#endif
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull x into local memory
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x(x_offset + i, col);
                                   });

              // matvec on block: local_residual -= A_block_cst * local_x
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // Write back the partial residual to y
      if (member.team_rank() == 0) {
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                             [&](const local_ordinal_type& k) {
                               y(row + k, col) = local_residual[k];
                             });
      }
    }
  }
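  // After the OwnedTag kernel above, y holds only the partial residual
  //   y_i = b_i - sum_{owned k} A_ik * x_k
  // with no D^{-1} applied yet; the NonownedTag kernel below finishes the job.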

  KOKKOS_INLINE_FUNCTION
  void operator()(const NonownedTag, const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        tpetra_values.data(), blocksize, blocksize);

    // Get shared allocation for a local copy of x, Ax, and A
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // y -= Rx
      // Initialize accumulation arrays.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i] = y(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets_remote.extent(2);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets_remote(rowidx, 0, k);
            int64_t x_offset = A_x_offsets_remote(rowidx, 1, k);
#if KOKKOS_VERSION >= 40799
            if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
#else
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
#endif
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull x into local memory
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x_remote(x_offset + i, col);
                                   });

              // matvec on block: local_residual -= A_block_cst * local_x
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const int k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // Compute local_Dinv_residual = D^-1 * local_residual
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();
      // local_Dinv_residual is fully computed. Now compute the
      // squared y update norm and update y (using damping factor).
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Compute the change in y (assuming damping_factor == 1) for this
            // entry.
            impl_scalar_type old_y = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
#if KOKKOS_VERSION >= 40799
            if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
#else
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
#endif
              magnitude_type ydiff =
#if KOKKOS_VERSION >= 40799
                  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
#else
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
#endif
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }
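  // The NonownedTag kernel above starts from the partial residual that pass 1
  // left in y, subtracts the remote (nonowned) block contributions using
  // A_x_offsets_remote and x_remote, and then applies the same D^{-1} solve,
  // damped update of y, and per-row squared-norm accumulation into W as the
  // single-pass functor.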

  // Launch pass 1 of the 2-pass version.
  // This computes just the owned part of residual and writes that back to y.
  void run_pass1(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
                 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
                 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass1",
        ComputeResidualAndSolve0, execution_space);

    b = b_;
    x = x_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size = blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, OwnedTag> policy(nrows, team_size,
                                                         vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass1", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  // Launch pass 2 of the 2-pass version.
  // This finishes computing residual with x_remote,
  // and then applies Dinv and computes norm.
  void run_pass2(
      const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
      const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
      const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass2",
        ComputeResidualAndSolve0, execution_space);

    x = x_;
    x_remote = x_remote_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, NonownedTag> policy(nrows, team_size,
                                                            vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass2", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
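// A minimal usage sketch for the two-pass functor (hypothetical caller code;
// the names and the halo-exchange step are assumptions, not defined here).
// Splitting the work this way would let the exchange of nonowned x entries
// overlap with the owned-part residual:
//
//   ComputeResidualAndSolve_2Pass<MatrixType, 0> functor(amd, d_inv, W, bs, omega);
//   functor.run_pass1(b, x_owned, y);         // owned residual -> y
//   exchange_remote_entries(x_remote);        // assumed import / halo exchange
//   functor.run_pass2(x_owned, x_remote, y);  // finish residual, solve, norms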

template <typename MatrixType, int B>
struct ComputeResidualAndSolve_SolveOnly {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;  // block crs (layout
                                                           // right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector
                                                             // (layout left)
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize and vector length
  enum : int { max_blocksize = 32 };

 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // AmD information
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // block offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // diagonal block inverses
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // squared update norms
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

 public:
  ComputeResidualAndSolve_SolveOnly(
      const AmD<MatrixType>& amd, const btdm_scalar_type_3d_view& d_inv_,
      const impl_scalar_type_1d_view& W_,
      const local_ordinal_type& blocksize_requested_,
      const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values)
    , blocksize_requested(blocksize_requested_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , d_inv(d_inv_)
    , W(W_)
    , damping_factor(damping_factor_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx =
        member.league_rank() * member.team_size() + member.team_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    // Get shared allocation for a local copy of x, Ax, and A
    impl_scalar_type* local_Dinv_residual =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    if (rowidx >= (local_ordinal_type)d_inv.extent(0)) return;

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // Compute local_Dinv_residual = D^-1 * local_residual
      Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                           [&](const local_ordinal_type& k0) {
                             impl_scalar_type val = 0;
                             for (local_ordinal_type k1 = 0; k1 < blocksize;
                                  k1++) {
                               val += d_inv(rowidx, k0, k1) * b(row + k1, col);
                             }
                             local_Dinv_residual[k0] = val;
                           });

      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::ThreadVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Compute the change in y (assuming damping_factor == 1) for this
            // entry.
            impl_scalar_type y_update = local_Dinv_residual[k];
#if KOKKOS_VERSION >= 40799
            if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
#else
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
#endif
              magnitude_type ydiff =
#if KOKKOS_VERSION >= 40799
                  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
#else
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
#endif
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerThread(member), [&]() { W(rowidx) = norm; });
  }
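  // Note the different mapping of hardware to work here: each thread of a team
  // owns one block row (rowidx = league_rank * team_size + team_rank) and only
  // ThreadVectorRange parallelism is used within the row, whereas the functors
  // above dedicate a whole team to each block row. With y assumed to start at
  // zero, the update reduces to y_i = omega * D_ii^{-1} b_i per column.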

  // ComputeResidualAndSolve_SolveOnly::run does the solve for the first
  // iteration, when the initial guess for y is zero. This means the residual
  // vector is just b. The kernel applies the inverse diags to b to find y, and
  // also puts the partial squared update norms (1 per row) into W.
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::Run_Y_Zero",
        ComputeResidualAndSolve0, execution_space);

    y = y_;
    b = b_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(
        (nrows + team_size - 1) / team_size, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::y_zero", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
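// Presumably the caller selects among the three functors in this file:
// _SolveOnly for the first sweep when the initial guess is zero (the residual
// is just b), _1Pass when the remote x entries are already available, and
// _2Pass when it is worthwhile to overlap that communication with the owned
// part of the residual. That selection logic lives outside this header.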

}  // namespace Ifpack2::BlockHelperDetails

#endif