Ifpack2 Templated Preconditioning Package Version 1.0
Ifpack2_BlockComputeResidualAndSolve.hpp
// @HEADER
// *****************************************************************************
// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
//
// Copyright 2009 NTESS and the Ifpack2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP
#define IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"

namespace Ifpack2::BlockHelperDetails {

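// For each block row, compute the offset (in scalars) of that row's diagonal
// block within the flat block-CRS values array. Throws if any row has no
// diagonal entry.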
template <typename ExecSpace, typename DiagOffsets, typename Rowptrs,
          typename Entries>
DiagOffsets findDiagOffsets(const Rowptrs& rowptrs, const Entries& entries,
                            size_t nrows, int blocksize) {
  DiagOffsets diag_offsets(do_not_initialize_tag("btdm.diag_offsets"), nrows);
  int err1 = 0;
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<ExecSpace>(0, nrows),
      KOKKOS_LAMBDA(size_t row, int& err2) {
        auto rowBegin = rowptrs(row);
        auto rowEnd = rowptrs(row + 1);
        for (size_t j = rowBegin; j < rowEnd; j++) {
          if (size_t(entries(j)) == row) {
            diag_offsets(row) = j * blocksize * blocksize;
            return;
          }
        }
        err2++;
      },
      err1);
  TEUCHOS_TEST_FOR_EXCEPT_MSG(
      err1, "Ifpack2 BTD: at least one row has no diagonal entry");
  return diag_offsets;
}
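
// Indexing sketch (illustrative, not part of the header): for a block-CRS
// matrix whose blocks are stored row-major ("layout right") in a flat values
// array, the offset returned above locates the first scalar of the diagonal
// block, so entry (i, j) of row r's diagonal block would be read as
//
//   auto off  = diag_offsets(r);                          // scalar offset of D_rr
//   auto d_ij = tpetra_values(off + i * blocksize + j);   // assumes row-major blocks
//
// where tpetra_values is the matrix's flat value array. The layout assumption
// matches the "block crs (layout right)" access view used by the functors below.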

template <typename MatrixType, int B>
struct ComputeResidualAndSolve_1Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;

  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;  // block crs (layout
                                                          // right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector
                                                            // (layout left)
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize
  enum : int { max_blocksize = 32 };

 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // AmD information
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // block offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // diagonal block inverses
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // squared update norms
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

 public:
  ComputeResidualAndSolve_1Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values)
    , blocksize_requested(blocksize_requested_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , d_inv(d_inv_)
    , W(W_)
    , damping_factor(damping_factor_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);
    const local_ordinal_type num_local_rows = d_inv.extent(0);

    const impl_scalar_type* xx;
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        tpetra_values.data(), blocksize, blocksize);

    // Get team scratch for the residual and D^-1 * residual, and thread scratch
    // for a local copy of x
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // y -= Rx
      // Initialize accumulation arrays
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i] = b(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
            if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull x into local memory
              int64_t remote_cutoff = blocksize * num_local_rows;
              if (x_offset >= remote_cutoff)
                xx = &x_remote(x_offset - remote_cutoff, col);
              else
                xx = &x(x_offset, col);

              Kokkos::parallel_for(
                  Kokkos::ThreadVectorRange(member, blocksize),
                  [&](const local_ordinal_type& i) { local_x[i] = xx[i]; });

              // matvec on block: local_residual -= A_block_cst * local_x
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const int k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // Compute local_Dinv_residual = D^-1 * local_residual
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();
      // local_Dinv_residual is fully computed. Now compute the
      // squared y update norm and update y (using damping factor).
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Compute the change in y (assuming damping_factor == 1) for this
            // entry.
            impl_scalar_type old_y = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
            if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }

  // Launch SinglePass version (owned + nonowned residual, plus Dinv in a single
  // kernel)
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunSinglePass",
        ComputeResidualAndSolve0, execution_space);

    y = y_;
    b = b_;
    x = x_;
    x_remote = x_remote_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    // team: local_residual, local_Dinv_residual
    const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
    // thread: local_x
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(nrows, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::SinglePass",
                         policy, *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
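
// Usage sketch for the single-pass functor (illustrative; the actual driver
// lives elsewhere in BlockTriDi). Assuming tpetra_values in AmD holds the
// off-diagonal part of A (A minus D, as the member names suggest), one call
// performs a damped block-Jacobi sweep
//
//   y = x + damping_factor * D^{-1} (b - A x)
//
// and writes the per-block-row squared update norms into W:
//
//   ComputeResidualAndSolve_1Pass<MatrixType, 0> functor(
//       amd, d_inv, W, blocksize, damping_factor);
//   functor.run(b, x_owned, x_remote, y);
//
// The template parameter B fixes the blocksize at compile time; B == 0 falls
// back to the runtime blocksize_requested.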

template <typename MatrixType, int B>
struct ComputeResidualAndSolve_2Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;

  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;  // block crs (layout
                                                          // right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector
                                                            // (layout left)
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize
  enum : int { max_blocksize = 32 };

  // Tag for computing residual with owned columns only (pass 1)
  struct OwnedTag {};

  // Tag for finishing the residual with nonowned columns, and solving/norming
  // (pass 2)
  struct NonownedTag {};

 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // AmD information
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // block offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // diagonal block inverses
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // squared update norms
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

 public:
  ComputeResidualAndSolve_2Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values)
    , blocksize_requested(blocksize_requested_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , d_inv(d_inv_)
    , W(W_)
    , damping_factor(damping_factor_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const OwnedTag, const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        tpetra_values.data(), blocksize, blocksize);

    // Get team scratch for the partial residual and thread scratch for a local
    // copy of x
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // y -= Rx
      // Initialize accumulation arrays
      Kokkos::parallel_for(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& i) { local_residual[i] = b(row + i, col); });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
            if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull x into local memory
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x(x_offset + i, col);
                                   });

              // matvec on block: local_residual -= A_block_cst * local_x
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // Write back the partial residual to y
      if (member.team_rank() == 0) {
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                             [&](const local_ordinal_type& k) {
                               y(row + k, col) = local_residual[k];
                             });
      }
    }
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const NonownedTag, const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        tpetra_values.data(), blocksize, blocksize);

    // Get team scratch for the residual and D^-1 * residual, and thread scratch
    // for a local copy of x
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // y -= Rx
      // Initialize accumulation arrays.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i] = y(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets_remote.extent(2);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets_remote(rowidx, 0, k);
            int64_t x_offset = A_x_offsets_remote(rowidx, 1, k);
            if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull x into local memory
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x_remote(x_offset + i, col);
                                   });

              // matvec on block: local_residual -= A_block_cst * local_x
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const int k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // Compute local_Dinv_residual = D^-1 * local_residual
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();
      // local_Dinv_residual is fully computed. Now compute the
      // squared y update norm and update y (using damping factor).
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Compute the change in y (assuming damping_factor == 1) for this
            // entry.
            impl_scalar_type old_y = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
            if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }

  // Launch pass 1 of the 2-pass version.
  // This computes just the owned part of residual and writes that back to y.
  void run_pass1(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
                 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
                 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass1",
        ComputeResidualAndSolve0, execution_space);

    b = b_;
    x = x_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size = blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, OwnedTag> policy(nrows, team_size,
                                                         vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass1", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  // Launch pass 2 of the 2-pass version.
  // This finishes computing residual with x_remote,
  // and then applies Dinv and computes norm.
  void run_pass2(
      const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
      const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
      const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass2",
        ComputeResidualAndSolve0, execution_space);

    x = x_;
    x_remote = x_remote_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, NonownedTag> policy(nrows, team_size,
                                                            vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass2", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
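
// Usage sketch for the two-pass functor (illustrative). Pass 1 subtracts the
// contributions of locally owned columns from b and stores that partial
// residual in y; pass 2 subtracts the remotely owned (imported) columns,
// applies D^{-1}, damps, and computes the update norms. Splitting the sweep
// this way presumably lets the import of x_remote proceed between the two
// launches:
//
//   ComputeResidualAndSolve_2Pass<MatrixType, 0> functor(
//       amd, d_inv, W, blocksize, damping_factor);
//   functor.run_pass1(b, x_owned, y);          // owned columns only
//   // ... finish communicating x_remote ...
//   functor.run_pass2(x_owned, x_remote, y);   // remote columns + D^{-1} + norms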

template <typename MatrixType, int B>
struct ComputeResidualAndSolve_SolveOnly {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;

  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;  // block crs (layout
                                                          // right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector
                                                            // (layout left)
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize
  enum : int { max_blocksize = 32 };

 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // AmD information
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // block offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // diagonal block inverses
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // squared update norms
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

 public:
  ComputeResidualAndSolve_SolveOnly(
      const AmD<MatrixType>& amd, const btdm_scalar_type_3d_view& d_inv_,
      const impl_scalar_type_1d_view& W_,
      const local_ordinal_type& blocksize_requested_,
      const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values)
    , blocksize_requested(blocksize_requested_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , d_inv(d_inv_)
    , W(W_)
    , damping_factor(damping_factor_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx =
        member.league_rank() * member.team_size() + member.team_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    // Get thread scratch to hold D^-1 * residual (here the residual is just b)
    impl_scalar_type* local_Dinv_residual =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    if (rowidx >= (local_ordinal_type)d_inv.extent(0)) return;

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // Compute local_Dinv_residual = D^-1 * b (the residual when y starts at zero)
      Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                           [&](const local_ordinal_type& k0) {
                             impl_scalar_type val = 0;
                             for (local_ordinal_type k1 = 0; k1 < blocksize;
                                  k1++) {
                               val += d_inv(rowidx, k0, k1) * b(row + k1, col);
                             }
                             local_Dinv_residual[k0] = val;
                           });

      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::ThreadVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Compute the change in y (assuming damping_factor == 1) for this
            // entry.
            impl_scalar_type y_update = local_Dinv_residual[k];
            if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerThread(member), [&]() { W(rowidx) = norm; });
  }

  // ComputeResidualAndSolve_SolveOnly::run does the solve for the first
  // iteration, when the initial guess for y is zero. This means the residual
  // vector is just b. The kernel applies the inverse diags to b to find y, and
  // also puts the partial squared update norms (1 per row) into W.
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::Run_Y_Zero",
        ComputeResidualAndSolve0, execution_space);

    y = y_;
    b = b_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(
        (nrows + team_size - 1) / team_size, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::y_zero", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
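
// Usage sketch for the solve-only functor (illustrative). With a zero initial
// guess the residual is simply b, so the first sweep reduces to
//
//   y = damping_factor * D^{-1} b
//
// with the per-block-row squared norms of D^{-1} b written into W:
//
//   ComputeResidualAndSolve_SolveOnly<MatrixType, 0> functor(
//       amd, d_inv, W, blocksize, damping_factor);
//   functor.run(b, y);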

}  // namespace Ifpack2::BlockHelperDetails

#endif