Ifpack2 Templated Preconditioning Package Version 1.0
Ifpack2_BlockTriDiContainer_impl.hpp
1// @HEADER
2// *****************************************************************************
3// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4//
5// Copyright 2009 NTESS and the Ifpack2 contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
11#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
12
13// #define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
14// #define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
15
16#include <Teuchos_Details_MpiTypeTraits.hpp>
17
18#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
19#include <Tpetra_Distributor.hpp>
20#include <Tpetra_BlockMultiVector.hpp>
21
22#include <KokkosKernels_ArithTraits.hpp>
23#include <KokkosBatched_Util.hpp>
24#include <KokkosBatched_Vector.hpp>
25#include <KokkosBatched_Copy_Decl.hpp>
26#include <KokkosBatched_Copy_Impl.hpp>
27#include <KokkosBatched_AddRadial_Decl.hpp>
28#include <KokkosBatched_AddRadial_Impl.hpp>
29#include <KokkosBatched_SetIdentity_Decl.hpp>
30#include <KokkosBatched_SetIdentity_Impl.hpp>
31#include <KokkosBatched_Gemm_Decl.hpp>
32#include <KokkosBatched_Gemm_Serial_Impl.hpp>
33#include <KokkosBatched_Gemm_Team_Impl.hpp>
34#include <KokkosBatched_Gemv_Decl.hpp>
35#include <KokkosBatched_Gemv_Team_Impl.hpp>
36#include <KokkosBatched_Trsm_Decl.hpp>
37#include <KokkosBatched_Trsm_Serial_Impl.hpp>
38#include <KokkosBatched_Trsm_Team_Impl.hpp>
39#include <KokkosBatched_Trsv_Decl.hpp>
40#include <KokkosBatched_Trsv_Serial_Impl.hpp>
41#include <KokkosBatched_Trsv_Team_Impl.hpp>
42#include <KokkosBatched_LU_Decl.hpp>
43#include <KokkosBatched_LU_Serial_Impl.hpp>
44#include <KokkosBatched_LU_Team_Impl.hpp>
45
46#include <KokkosBlas1_nrm1.hpp>
47#include <KokkosBlas1_nrm2.hpp>
48
49#include <memory>
50
51#include "Ifpack2_BlockHelper.hpp"
52#include "Ifpack2_BlockComputeResidualVector.hpp"
53#include "Ifpack2_BlockComputeResidualAndSolve.hpp"
54
55// This should be exposed as a CMake variable (or the flag used only when necessary)
56// #define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
57// #undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
58#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
59#include "cuda_profiler_api.h"
60#endif
61
62// I am not 100% sure about MPI 3 on CUDA
63#if MPI_VERSION >= 3
64#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
65#endif
66
67// ::: Experiments :::
68// define either pinned memory or CUDA memory for MPI
69// if both macros are disabled, it will use the Tpetra memory space, which is UVM space for CUDA
70// if defined, this uses pinned memory instead of a device pointer
71// by default, we enable pinned memory
72#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
73// #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
74
75// if defined, all views are allocated on CUDA space instead of CUDA UVM space
76#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
77
78// if defined, btdm_scalar_type is used (if impl_scalar_type is double, btdm_scalar_type is float)
79#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
80#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
81#endif
82
83// if defined, it uses multiple execution spaces
84#define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
85
86namespace Ifpack2 {
87
88namespace BlockTriDiContainerDetails {
89
90namespace KB = KokkosBatched;
91
92///
93/// view decorators for unmanaged, const, atomic, and scratch memory views
94///
95using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
96
97template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
98using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
99 MemoryTraitsType::is_random_access |
100 flag>;
101
102template <typename ViewType>
103using Unmanaged = Kokkos::View<typename ViewType::data_type,
104 typename ViewType::array_layout,
105 typename ViewType::device_type,
106 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
107template <typename ViewType>
108using Atomic = Kokkos::View<typename ViewType::data_type,
109 typename ViewType::array_layout,
110 typename ViewType::device_type,
111 MemoryTraits<typename ViewType::memory_traits, Kokkos::Atomic>>;
112template <typename ViewType>
113using Const = Kokkos::View<typename ViewType::const_data_type,
114 typename ViewType::array_layout,
115 typename ViewType::device_type,
116 typename ViewType::memory_traits>;
117template <typename ViewType>
118using ConstUnmanaged = Const<Unmanaged<ViewType>>;
119
120template <typename ViewType>
121using AtomicUnmanaged = Atomic<Unmanaged<ViewType>>;
122
129template <typename ViewType>
130using Scratch = Kokkos::View<typename ViewType::data_type,
131 typename ViewType::array_layout,
132 typename ViewType::execution_space::scratch_memory_space,
133 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
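
// Added illustration: these decorators change only the memory traits of a
// view type; the names below are hypothetical, e.g.
//
//   using example_view = Kokkos::View<double *>;
//   Unmanaged<example_view> u(ptr, n);    // non-owning wrap of existing data
//   ConstUnmanaged<example_view> cu = u;  // non-owning and read-only
//
// ConstUnmanaged is the typical choice for kernel inputs captured by value.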
134
138template <typename T>
139struct BlockTridiagScalarType {
140 typedef T type;
141};
142#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
143template <>
144struct BlockTridiagScalarType<double> {
145 typedef float type;
146};
147// template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
148#endif
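
// Added sanity check (illustrative; assumes <type_traits> is reachable via
// the existing includes, as the std::is_same uses below already rely on):
// the mapping above demotes double to float only when the small-scalar
// option is enabled.
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
static_assert(std::is_same<BlockTridiagScalarType<double>::type, float>::value,
              "small scalar option stores double tridiag factors as float");
#else
static_assert(std::is_same<BlockTridiagScalarType<double>::type, double>::value,
              "without the small scalar option the scalar type is unchanged");
#endif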
149
150#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
151#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
152 KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
153
154#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
155 { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStop()); }
156#else
158#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
159#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
160#endif
161
165template <typename MatrixType>
166typename Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
167createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
168 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
169 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
170 using tpetra_map_type = typename impl_type::tpetra_map_type;
171 using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
172 using tpetra_import_type = typename impl_type::tpetra_import_type;
173 using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
174 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
175
176 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
177 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
178
179 bool hasBlockCrsMatrix = !A_bcrs.is_null();
180
181 // It is OK here to use the graph of the A_crs matrix and a block size of 1
182 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
183
184 const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
185 const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
186 const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap(), blocksize)));
187 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
188 return Teuchos::rcp(new tpetra_import_type(src, tgt));
189}
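
// Added note with a hypothetical example: makePointMap expands each block
// GID into blocksize point GIDs, so a domain map with N owned blocks and
// blocksize = 4 becomes a point map with 4*N owned point indices; the
// returned importer moves point data from the domain layout to the column
// (overlapped) layout.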
190
191// Partial replacement for forward-mode MultiVector::doImport.
192// Permits overlapped communication and computation, but also supports synchronized exchange.
193// I'm finding that overlapped comm/comp can give quite poor performance on some
194// platforms, so we can't always use it straightforwardly.
195
196template <typename MatrixType>
197struct AsyncableImport {
198 public:
199 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
200
201 private:
205#if !defined(HAVE_IFPACK2_MPI)
206 typedef int MPI_Request;
207 typedef int MPI_Comm;
208#endif
211 using scalar_type = typename impl_type::scalar_type;
212
213 static int isend(const MPI_Comm comm, const char *buf, int count, int dest, int tag, MPI_Request *ireq) {
214#ifdef HAVE_IFPACK2_MPI
215 MPI_Request ureq;
216 int ret = MPI_Isend(const_cast<char *>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
217 if (ireq == NULL) MPI_Request_free(&ureq);
218 return ret;
219#else
220 return 0;
221#endif
222 }
223
224 static int irecv(const MPI_Comm comm, char *buf, int count, int src, int tag, MPI_Request *ireq) {
225#ifdef HAVE_IFPACK2_MPI
226 MPI_Request ureq;
227 int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
228 if (ireq == NULL) MPI_Request_free(&ureq);
229 return ret;
230#else
231 return 0;
232#endif
233 }
234
235 static int waitany(int count, MPI_Request *reqs, int *index) {
236#ifdef HAVE_IFPACK2_MPI
237 return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
238#else
239 return 0;
240#endif
241 }
242
243 static int waitall(int count, MPI_Request *reqs) {
244#ifdef HAVE_IFPACK2_MPI
245 return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
246#else
247 return 0;
248#endif
249 }
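
 // Note on the wrappers above: without MPI they compile to no-ops, and when
 // ireq is NULL the isend/irecv variants free the request immediately,
 // giving fire-and-forget semantics (MPI still completes the operation).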
250
251 public:
252 using tpetra_map_type = typename impl_type::tpetra_map_type;
253 using tpetra_import_type = typename impl_type::tpetra_import_type;
254
255 using local_ordinal_type = typename impl_type::local_ordinal_type;
256 using global_ordinal_type = typename impl_type::global_ordinal_type;
257 using size_type = typename impl_type::size_type;
258 using impl_scalar_type = typename impl_type::impl_scalar_type;
259
260 using int_1d_view_host = Kokkos::View<int *, Kokkos::HostSpace>;
261 using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type *, Kokkos::HostSpace>;
262
263 using execution_space = typename impl_type::execution_space;
264 using memory_space = typename impl_type::memory_space;
265 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
266 using size_type_1d_view = typename impl_type::size_type_1d_view;
267 using size_type_1d_view_host = Kokkos::View<size_type *, Kokkos::HostSpace>;
268
269#if defined(KOKKOS_ENABLE_CUDA)
270 using impl_scalar_type_1d_view =
271 typename std::conditional<std::is_same<execution_space, Kokkos::Cuda>::value,
272#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
273 Kokkos::View<impl_scalar_type *, Kokkos::CudaHostPinnedSpace>,
274#elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
275 Kokkos::View<impl_scalar_type *, Kokkos::CudaSpace>,
276#else // no experimental macros are defined
277 typename impl_type::impl_scalar_type_1d_view,
278#endif
279 typename impl_type::impl_scalar_type_1d_view>::type;
280#else
281 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
282#endif
283 using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type *, Kokkos::HostSpace>;
284 using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
285 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
286
287#ifdef HAVE_IFPACK2_MPI
288 MPI_Comm comm;
289#endif
290
291 impl_scalar_type_2d_view_tpetra remote_multivector;
292 local_ordinal_type blocksize;
293
294 template <typename T>
295 struct SendRecvPair {
296 T send, recv;
297 };
298
299 // (s)end and (r)eceive data:
300 SendRecvPair<int_1d_view_host> pids; // mpi ranks
301 SendRecvPair<std::vector<MPI_Request>> reqs; // MPI_Request is a pointer; cannot use a Kokkos view
302 SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
303 SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
304 SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
305 SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
306 SendRecvPair<impl_scalar_type_1d_view_host> buffer_host; // data buffer
307
308 local_ordinal_type_1d_view dm2cm; // permutation
309
310#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
311 using exec_instance_1d_std_vector = std::vector<execution_space>;
312 exec_instance_1d_std_vector exec_instances;
313#endif
314
315 // for cuda
316 public:
317 void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
318 const size_type_1d_view &offs) {
319 // wrap lens in a Kokkos view and deep copy it to the device
320 Kokkos::View<size_t *, Kokkos::HostSpace> lens_host(const_cast<size_t *>(lens.getRawPtr()), lens.size());
321 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
322
323 // exclusive scan
324 const Kokkos::RangePolicy<execution_space> policy(0, offs.extent(0));
325 const local_ordinal_type lens_size = lens_device.extent(0);
326 Kokkos::parallel_scan(
327 "AsyncableImport::RangePolicy::setOffsetValues",
328 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
329 if (final)
330 offs(i) = update;
331 update += (i < lens_size ? lens_device[i] : 0);
332 });
333 }
334
335 void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
336 const size_type_1d_view_host &offs) {
337 // lens is already host-accessible, so no device copy is needed
340
341 // exclusive scan
342 offs(0) = 0;
343 for (local_ordinal_type i = 1, iend = offs.extent(0); i < iend; ++i) {
344 offs(i) = offs(i - 1) + lens[i - 1];
345 }
346 }
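
 // Worked example (added; not in the original source): for lens = {3, 1, 2}
 // and offs of extent 4, both scans above yield offs = {0, 3, 4, 6}; offs(i)
 // is where process i's chunk begins and offs(3) is the total buffer length.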
347
348 private:
349 void createMpiRequests(const tpetra_import_type &import) {
350 Tpetra::Distributor &distributor = import.getDistributor();
351
352 // copy pids from distributor
353 const auto pids_from = distributor.getProcsFrom();
354 pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
355 memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int) * pids.recv.extent(0));
356
357 const auto pids_to = distributor.getProcsTo();
358 pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
359 memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int) * pids.send.extent(0));
360
361 // mpi requests
362 reqs.recv.resize(pids.recv.extent(0));
363 memset(reqs.recv.data(), 0, reqs.recv.size() * sizeof(MPI_Request));
364 reqs.send.resize(pids.send.extent(0));
365 memset(reqs.send.data(), 0, reqs.send.size() * sizeof(MPI_Request));
366
367 // construct offsets
368#if 0
369 const auto lengths_to = distributor.getLengthsTo();
370 offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
371
372 const auto lengths_from = distributor.getLengthsFrom();
373 offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
374
375 setOffsetValues(lengths_to, offset.send);
376 offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
377
378 setOffsetValues(lengths_from, offset.recv);
379 offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
380#else
381 const auto lengths_to = distributor.getLengthsTo();
382 offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
383
384 const auto lengths_from = distributor.getLengthsFrom();
385 offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
386
387 setOffsetValuesHost(lengths_to, offset_host.send);
388 // offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);
389
390 setOffsetValuesHost(lengths_from, offset_host.recv);
391 // offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
392#endif
393 }
394
395 void createSendRecvIDs(const tpetra_import_type &import) {
396 // For each remote PID, the list of LIDs to receive.
397 const auto remote_lids = import.getRemoteLIDs();
398 const local_ordinal_type_1d_view_host
399 remote_lids_view_host(const_cast<local_ordinal_type *>(remote_lids.getRawPtr()), remote_lids.size());
400 lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
401 Kokkos::deep_copy(lids.recv, remote_lids_view_host);
402
403 // For each export PID, the list of LIDs to send.
404 auto epids = import.getExportPIDs();
405 auto elids = import.getExportLIDs();
406 TEUCHOS_ASSERT(epids.size() == elids.size());
407 lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
408 auto lids_send_host = Kokkos::create_mirror_view(lids.send);
409
410 // naive search (not sure if pids or epids are sorted)
411 for (local_ordinal_type cnt = 0, i = 0, iend = pids.send.extent(0); i < iend; ++i) {
412 const auto pid_send_value = pids.send[i];
413 for (local_ordinal_type j = 0, jend = epids.size(); j < jend; ++j)
414 if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
415 TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i + 1]);
416 }
417 Kokkos::deep_copy(lids.send, lids_send_host);
418 }
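
 // Worked example (added; hypothetical data): if pids.send = {1, 3},
 // epids = {3, 1, 3}, and elids = {7, 2, 9}, the search above produces
 // lids_send_host = {2, 7, 9}: all LIDs bound for rank 1, then all LIDs
 // bound for rank 3, consistent with the offset_host.send boundaries.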
419
420 void createExecutionSpaceInstances() {
421#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
422 // The following line creates 8 streams:
423 exec_instances =
424 Kokkos::Experimental::partition_space(execution_space(), std::vector<int>(8, 1));
425#endif
426 }
427
428 public:
429 // for cuda, all tag types are public
430 struct ToBuffer {};
431 struct ToMultiVector {};
432
433 AsyncableImport(const Teuchos::RCP<const tpetra_map_type> &src_map,
434 const Teuchos::RCP<const tpetra_map_type> &tgt_map,
435 const local_ordinal_type blocksize_,
436 const local_ordinal_type_1d_view dm2cm_) {
437 blocksize = blocksize_;
438 dm2cm = dm2cm_;
439
440#ifdef HAVE_IFPACK2_MPI
441 comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
442#endif
443 const tpetra_import_type import(src_map, tgt_map);
444
445 createMpiRequests(import);
446 createSendRecvIDs(import);
447 createExecutionSpaceInstances();
448 }
449
450 void createDataBuffer(const local_ordinal_type &num_vectors) {
451 const size_type extent_0 = lids.recv.extent(0) * blocksize;
452 const size_type extent_1 = num_vectors;
453 if (remote_multivector.extent(0) == extent_0 &&
454 remote_multivector.extent(1) == extent_1) {
455 // skip
456 } else {
457 remote_multivector =
458 impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
459
460 const auto send_buffer_size = offset_host.send[offset_host.send.extent(0) - 1] * blocksize * num_vectors;
461 const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0) - 1] * blocksize * num_vectors;
462
463 buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
464 buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
465
466 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
467 buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
468 buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
469 }
470 }
471 }
472
473 void cancel() {
474#ifdef HAVE_IFPACK2_MPI
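    // cancel() simply drains outstanding requests via waitall;
    // no MPI_Cancel is issued.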
475 waitall(reqs.recv.size(), reqs.recv.data());
476 waitall(reqs.send.size(), reqs.send.data());
477#endif
478 }
479
480 // ======================================================================
481 // Async version using execution space instances
482 // ======================================================================
483
484#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
485 template <typename PackTag>
486 static void copy(const local_ordinal_type_1d_view &lids_,
487 const impl_scalar_type_1d_view &buffer_,
488 const local_ordinal_type ibeg_,
489 const local_ordinal_type iend_,
490 const impl_scalar_type_2d_view_tpetra &multivector_,
491 const local_ordinal_type blocksize_,
492 const execution_space &exec_instance_) {
493 const local_ordinal_type num_vectors = multivector_.extent(1);
494 const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
495 const local_ordinal_type idiff = iend_ - ibeg_;
496 const auto abase = buffer_.data() + mv_blocksize * ibeg_;
497
498 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
499 local_ordinal_type vector_size(0);
500 if (blocksize_ <= 4)
501 vector_size = 4;
502 else if (blocksize_ <= 8)
503 vector_size = 8;
504 else if (blocksize_ <= 16)
505 vector_size = 16;
506 else
507 vector_size = 32;
508
509 const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
510 const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
511 Kokkos::parallel_for( //"AsyncableImport::TeamPolicy::copyViaCudaStream",
512 Kokkos::Experimental::require(policy, work_item_property),
513 KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
514 const local_ordinal_type i = member.league_rank();
515 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](const local_ordinal_type &j) {
516 auto aptr = abase + blocksize_ * (i + idiff * j);
517 auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
518 if (std::is_same<PackTag, ToBuffer>::value)
519 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
520 aptr[k] = bptr[k];
521 });
522 else
523 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
524 bptr[k] = aptr[k];
525 });
526 });
527 });
528 }
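
 // Added layout note with a small example: the buffer groups values by block
 // row first and vector second, so with blocksize_ = 2, idiff = 3, and
 // num_vectors = 2, the value for row i, vector j, entry k lives at
 // abase[2 * (i + 3 * j) + k]; ToBuffer packs into that slot and
 // ToMultiVector unpacks from it.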
529
530 void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
531 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
532
533#ifdef HAVE_IFPACK2_MPI
534 // constants and reallocate data buffers if necessary
535 const local_ordinal_type num_vectors = mv.extent(1);
536 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
537
538 // 0. post receive async
539 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
540 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
541 irecv(comm,
542 reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
543 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
544 pids.recv[i],
545 42,
546 &reqs.recv[i]);
547 } else {
548 irecv(comm,
549 reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
550 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
551 pids.recv[i],
552 42,
553 &reqs.recv[i]);
554 }
555 }
556
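 // make sure work on the default instance that produced mv is complete
 // before the per-stream pack kernels below read it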
558 execution_space().fence();
559
560 // 1. async memcpy
561 for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
562 // 1.0. enqueue pack buffer
563 if (i < 8) exec_instances[i % 8].fence();
564 copy<ToBuffer>(lids.send, buffer.send,
565 offset_host.send(i), offset_host.send(i + 1),
566 mv, blocksize,
567 // execution_space());
568 exec_instances[i % 8]);
569 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
570 // if (i<8) exec_instances[i%8].fence();
571 const local_ordinal_type num_vectors = mv.extent(1);
572 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
573
574 Kokkos::deep_copy(exec_instances[i % 8],
575 Kokkos::subview(buffer_host.send,
576 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
577 offset_host.send(i) * mv_blocksize,
578 offset_host.send(i + 1) * mv_blocksize)),
579 Kokkos::subview(buffer.send,
580 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
581 offset_host.send(i) * mv_blocksize,
582 offset_host.send(i + 1) * mv_blocksize)));
583 }
584 }
586 // execution_space().fence();
587 for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
588 // 1.1. sync the stream and isend
589 if (i < 8) exec_instances[i % 8].fence();
590 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
591 isend(comm,
592 reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
593 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
594 pids.send[i],
595 42,
596 &reqs.send[i]);
597 } else {
598 isend(comm,
599 reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
600 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
601 pids.send[i],
602 42,
603 &reqs.send[i]);
604 }
605 }
606
607 // 2. poke communication
608 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
609 int flag;
610 MPI_Status stat;
611 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
612 }
613#endif // HAVE_IFPACK2_MPI
614 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
615 }
616
617 void syncRecvVar1() {
618 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
619#ifdef HAVE_IFPACK2_MPI
620 // 0. wait for receive async.
621 for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.recv.extent(0)); ++i) {
622 local_ordinal_type idx = i;
623
624 // 0.0. wait any
625 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
626
627 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
628 const local_ordinal_type num_vectors = remote_multivector.extent(1);
629 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
630
631 Kokkos::deep_copy(
632 Kokkos::subview(buffer.recv,
633 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
634 offset_host.recv(idx) * mv_blocksize,
635 offset_host.recv(idx + 1) * mv_blocksize)),
636 Kokkos::subview(buffer_host.recv,
637 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
638 offset_host.recv(idx) * mv_blocksize,
639 offset_host.recv(idx + 1) * mv_blocksize)));
640 }
641
642 // 0.1. unpack data after data is moved into a device
643 copy<ToMultiVector>(lids.recv, buffer.recv,
644 offset_host.recv(idx), offset_host.recv(idx + 1),
645 remote_multivector, blocksize,
646 exec_instances[idx % 8]);
647 }
648
649 // 1. fence all execution space instances so the unpack kernels complete
650 Kokkos::fence();
651
652 // 2. cleanup all open comm
653 waitall(reqs.send.size(), reqs.send.data());
654#endif // HAVE_IFPACK2_MPI
655 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
656 }
657#endif // defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
658
659 // ======================================================================
660 // Generic version without using execution space instances
661 // - only difference between device and host architecture is on using team
662 // or range policies.
663 // ======================================================================
664 template <typename PackTag>
665 static void copy(const local_ordinal_type_1d_view &lids_,
666 const impl_scalar_type_1d_view &buffer_,
667 const local_ordinal_type &ibeg_,
668 const local_ordinal_type &iend_,
669 const impl_scalar_type_2d_view_tpetra &multivector_,
670 const local_ordinal_type blocksize_) {
671 const local_ordinal_type num_vectors = multivector_.extent(1);
672 const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
673 const local_ordinal_type idiff = iend_ - ibeg_;
674 const auto abase = buffer_.data() + mv_blocksize * ibeg_;
675 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
676 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
677 local_ordinal_type vector_size(0);
678 if (blocksize_ <= 4)
679 vector_size = 4;
680 else if (blocksize_ <= 8)
681 vector_size = 8;
682 else if (blocksize_ <= 16)
683 vector_size = 16;
684 else
685 vector_size = 32;
686 const team_policy_type policy(idiff, 1, vector_size);
687 Kokkos::parallel_for(
688 "AsyncableImport::TeamPolicy::copy",
689 policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
690 const local_ordinal_type i = member.league_rank();
691 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](const local_ordinal_type &j) {
692 auto aptr = abase + blocksize_ * (i + idiff * j);
693 auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
694 if (std::is_same<PackTag, ToBuffer>::value)
695 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
696 aptr[k] = bptr[k];
697 });
698 else
699 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
700 bptr[k] = aptr[k];
701 });
702 });
703 });
704 } else {
705 const Kokkos::RangePolicy<execution_space> policy(0, idiff * num_vectors);
706 Kokkos::parallel_for(
707 "AsyncableImport::RangePolicy::copy",
708 policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
709 const local_ordinal_type i = ij % idiff;
710 const local_ordinal_type j = ij / idiff;
711 auto aptr = abase + blocksize_ * (i + idiff * j);
712 auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
713 auto from = std::is_same<PackTag, ToBuffer>::value ? bptr : aptr;
714 auto to = std::is_same<PackTag, ToBuffer>::value ? aptr : bptr;
715 memcpy(to, from, sizeof(impl_scalar_type) * blocksize_);
716 });
717 }
718 }
719
723 void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
724 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
725
726#ifdef HAVE_IFPACK2_MPI
727 // constants and reallocate data buffers if necessary
728 const local_ordinal_type num_vectors = mv.extent(1);
729 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
730
731 // receive async
732 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
733 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
734 irecv(comm,
735 reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
736 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
737 pids.recv[i],
738 42,
739 &reqs.recv[i]);
740 } else {
741 irecv(comm,
742 reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
743 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
744 pids.recv[i],
745 42,
746 &reqs.recv[i]);
747 }
748 }
749
750 // send async
751 for (local_ordinal_type i = 0, iend = pids.send.extent(0); i < iend; ++i) {
752 copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i + 1),
753 mv, blocksize);
754 Kokkos::fence();
755 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
756 isend(comm,
757 reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
758 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
759 pids.send[i],
760 42,
761 &reqs.send[i]);
762 } else {
763 Kokkos::deep_copy(
764 Kokkos::subview(buffer_host.send,
765 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
766 offset_host.send(i) * mv_blocksize,
767 offset_host.send(i + 1) * mv_blocksize)),
768 Kokkos::subview(buffer.send,
769 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
770 offset_host.send(i) * mv_blocksize,
771 offset_host.send(i + 1) * mv_blocksize)));
772 isend(comm,
773 reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
774 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
775 pids.send[i],
776 42,
777 &reqs.send[i]);
778 }
779 }
780
781 // I find that issuing an Iprobe seems to nudge some MPIs into action,
782 // which helps with overlapped comm/comp performance.
783 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
784 int flag;
785 MPI_Status stat;
786 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
787 }
788#endif
789 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
790 }
791
792 void syncRecvVar0() {
793 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
794#ifdef HAVE_IFPACK2_MPI
795 // receive async.
796 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
797 local_ordinal_type idx = i;
798 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
799 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
800 const local_ordinal_type num_vectors = remote_multivector.extent(1);
801 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
802 Kokkos::deep_copy(
803 Kokkos::subview(buffer.recv,
804 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
805 offset_host.recv(idx) * mv_blocksize,
806 offset_host.recv(idx + 1) * mv_blocksize)),
807 Kokkos::subview(buffer_host.recv,
808 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
809 offset_host.recv(idx) * mv_blocksize,
810 offset_host.recv(idx + 1) * mv_blocksize)));
811 }
812 copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx + 1),
813 remote_multivector, blocksize);
814 }
815 // wait on the sends to match all Isends with a cleanup operation.
816 waitall(reqs.send.size(), reqs.send.data());
817#endif
818 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
819 }
820
824 void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
825#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
826#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
827 asyncSendRecvVar1(mv);
828#else
829 asyncSendRecvVar0(mv);
830#endif
831#else
832 asyncSendRecvVar0(mv);
833#endif
834 }
835 void syncRecv() {
836#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
837#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
838 syncRecvVar1();
839#else
840 syncRecvVar0();
841#endif
842#else
843 syncRecvVar0();
844#endif
845 }
846
847 void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
848 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
849 asyncSendRecv(mv);
850 syncRecv();
851 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
852 }
853
854 impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
855};
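
// A minimal usage sketch (added; hypothetical driver, assuming "importer"
// came from createBlockCrsAsyncImporter and "mv" is the owned local view):
//
//   importer->createDataBuffer(num_vectors);  // size buffers for this nvec
//   importer->asyncSendRecv(mv);              // post Irecv, pack, post Isend
//   /* ... overlapped local computation ... */
//   importer->syncRecv();                     // waitany + unpack remote rows
//   auto remote = importer->getRemoteMultiVectorLocalView();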
856
857template <typename ViewType1, typename ViewType2>
858struct are_same_struct {
859 ViewType1 keys1;
860 ViewType2 keys2;
861
862 are_same_struct(ViewType1 keys1_, ViewType2 keys2_)
863 : keys1(keys1_)
864 , keys2(keys2_) {}
865 KOKKOS_INLINE_FUNCTION
866 void operator()(int i, unsigned int &count) const {
867 if (keys1(i) != keys2(i)) count++;
868 }
869};
870
871template <typename ViewType1, typename ViewType2>
872bool are_same(ViewType1 keys1, ViewType2 keys2) {
873 unsigned int are_same_ = 0;
874
875 Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
876 are_same_struct(keys1, keys2),
877 are_same_);
878 return are_same_ == 0;
879}
880
884template <typename MatrixType>
885Teuchos::RCP<AsyncableImport<MatrixType>>
886createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
887 IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter", createBlockCrsAsyncImporter);
888 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
889 using tpetra_map_type = typename impl_type::tpetra_map_type;
890 using local_ordinal_type = typename impl_type::local_ordinal_type;
891 using global_ordinal_type = typename impl_type::global_ordinal_type;
892 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
893 using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
894 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
895 using global_indices_array_device_type = Kokkos::View<const global_ordinal_type *, typename tpetra_map_type::device_type>;
896
897 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
898 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
899
900 bool hasBlockCrsMatrix = !A_bcrs.is_null();
901
902 // It is OK here to use the graph of the A_crs matrix and a block size of 1
903 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
904
905 const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
906 const auto domain_map = g.getDomainMap();
907 const auto column_map = g.getColMap();
908
909 std::vector<global_ordinal_type> gids;
910
911 Kokkos::Subview<global_indices_array_device_type, std::pair<int, int>> column_map_global_iD_last;
912
913 bool separate_remotes = true, found_first = false, need_owned_permutation = false;
914 {
915 IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);
916
917 global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
918 global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();
919
920 if (are_same(domain_map_global_iD, column_map_global_iD)) {
921 // this should be the most likely path
922 separate_remotes = true;
923 need_owned_permutation = false;
924
925 column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
926 std::pair<int, int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
927 } else {
928 // This loop is relatively expensive
929 for (size_t i = 0; i < column_map->getLocalNumElements(); ++i) {
930 const global_ordinal_type gid = column_map->getGlobalElement(i);
931 if (!domain_map->isNodeGlobalElement(gid)) {
932 found_first = true;
933 gids.push_back(gid);
934 } else if (found_first) {
935 separate_remotes = false;
936 break;
937 }
938 if (!found_first && !need_owned_permutation &&
939 domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
940 // The owned part of the domain and column maps are different
941 // orderings. We *could* do a super efficient impl of this case in the
942 // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
943 // really, if a caller cares about speed, they wouldn't make different
944 // local permutations like this. So we punt on the best impl and go for
945 // a pretty good one: the permutation is done in place in
946 // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
947 // is the presumably worse memory access pattern of the input vector.
948 need_owned_permutation = true;
949 }
950 }
951 }
952 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
953 }
954
955 if (separate_remotes) {
956 IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
957 const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
958 const auto parsimonious_col_map = need_owned_permutation ? Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm())) : Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
959 if (parsimonious_col_map->getGlobalNumElements() > 0) {
960 // make the importer only if needed.
961 local_ordinal_type_1d_view dm2cm;
962 if (need_owned_permutation) {
963 dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
964 const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
965 for (size_t i = 0; i < domain_map->getLocalNumElements(); ++i)
966 dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
967 Kokkos::deep_copy(dm2cm, dm2cm_host);
968 }
969 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
970 return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
971 }
972 }
973 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
974 return Teuchos::null;
975}
976
977template <typename local_ordinal_type>
978local_ordinal_type costTRSM(const local_ordinal_type block_size) {
979 return block_size * block_size;
980}
981
982template <typename local_ordinal_type>
983local_ordinal_type costGEMV(const local_ordinal_type block_size) {
984 return 2 * block_size * block_size;
985}
986
987template <typename local_ordinal_type>
988local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
989 return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length - 1) * costGEMV(block_size);
990}
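
// Worked example (added): with block_size = 4 and subline_length = 10,
// costTRSM = 16 and costGEMV = 32, so
// costTriDiagSolve = 2*10*16 + 2*9*32 = 896 flop units per subline.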
991
992template <typename local_ordinal_type>
993local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
994 const local_ordinal_type num_teams,
995 const local_ordinal_type line_length,
996 const local_ordinal_type block_size,
997 const local_ordinal_type n_subparts_per_part) {
998 const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part - 1) * 2) / n_subparts_per_part);
999 if (subline_length < 1) {
1000 return INT_MAX;
1001 }
1002
1003 const local_ordinal_type p_n_lines = ceil(double(num_parts) / num_teams);
1004 const local_ordinal_type p_n_sublines = ceil(double(n_subparts_per_part) * num_parts / num_teams);
1005 const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part - 1) * num_parts / num_teams);
1006
1007 const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
1008 const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part - 1) * 2, block_size);
1009 const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length, block_size);
1010 const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);
1011
1012 if (n_subparts_per_part == 1) {
1013 return p_costApplyAinv;
1014 }
1015 return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
1016}
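
// Worked example (added; hand-evaluated): num_parts = 100, num_teams = 10,
// line_length = 32, block_size = 4, n_subparts_per_part = 2 give
// subline_length = 15, p_n_lines = 10, p_n_sublines = 20, p_n_sublines_2 = 10:
//   p_costApplyE    = 10 * 15 * 2 * 32             = 9600
//   p_costApplyS    = 10 * costTriDiagSolve(2, 4)  = 1280
//   p_costApplyAinv = 20 * costTriDiagSolve(15, 4) = 27520
//   p_costApplyC    = 10 * 2 * 32                  = 640
// for a total estimate of 39040.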
1017
1018template <typename local_ordinal_type>
1019local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
1020 const local_ordinal_type num_teams,
1021 const local_ordinal_type line_length,
1022 const local_ordinal_type block_size) {
1023 // BMK: replaced theoretical model with empirical model
1024 // This is a linear regression based on data from a grid search.
1025 // The independent terms in the regression are:
1026 // - "parallelism surplus" - smaller when problem has enough lines to saturate GPU, larger otherwise
1027 // - log2 of the line length
1028 // - block size
1029 double parallelismSurplus = Kokkos::sqrt((double)num_teams / num_parts);
1030 double logLineLength = Kokkos::log2((double)line_length);
1031 (void)logLineLength;
1032 // Directly predict with linear model
1033#if defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU)
1034 // MI300-specific data
1035 double modeled = -9.2312 + 4.6946 * parallelismSurplus + 0.4095 * block_size + 0.966 * logLineLength;
1036 // Do not split lines if there is plenty of parallelism
1037 if (parallelismSurplus < 0.3)
1038 modeled = 1;
1039#elif defined(KOKKOS_ARCH_HOPPER) || defined(KOKKOS_ARCH_BLACKWELL)
1040 // Based on H100 data
1041 double modeled = -9.6053 + 4.7477 * parallelismSurplus + 0.2338 * block_size + 1.0794 * logLineLength;
1042 // On H100, performance degrades rapidly if small lines are split too many times
1043 double maxSplit = (double)line_length / 8;
1044 if (modeled > maxSplit)
1045 modeled = maxSplit;
1046#elif defined(KOKKOS_ENABLE_CUDA)
1047 // Based on V100 data, line splitting is profitable in fewer cases
1048 // (only when there are few, long lines)
1049 double modeled = 1;
1050 if (parallelismSurplus > 1 && line_length > 64)
1051 modeled = 4;
1052#elif defined(KOKKOS_ENABLE_HIP)
1053 // Based on MI250X data
1054 double modeled = -8.6214 + 7.3468 * parallelismSurplus + 0.3596 * block_size + 0.6673 * logLineLength;
1055#else
1056 // GPUs other than CUDA or HIP: default to simple model that works for V100
1057 double modeled = 1;
1058 if (parallelismSurplus > 1 && line_length > 64)
1059 modeled = 4;
1060#endif
1061
1062 // Round to nearest integer
1063 local_ordinal_type n_subparts_per_part = 0.5 + modeled;
1064 // Do not split lines if there is plenty of parallelism available
1065 if (parallelismSurplus < 0.3)
1066 n_subparts_per_part = 1;
1067 // Clamp the result to valid range
1068 // Criteria for valid n_subparts_per_part (where connection_length is 2 for wide separators)
1069 // line_length >= n_subparts_per_part + (n_subparts_per_part - 1) * connection_length
1070 // Equivalently:
1071 // line_length >= n_subparts_per_part + n_subparts_per_part * 2 - 2
1072 // line_length >= 3 * n_subparts_per_part - 2
1073 local_ordinal_type min_subparts_per_part = 1;
1074 local_ordinal_type max_subparts_per_part = (line_length + 2) / 3;
1075 // Limit memory usage from too many sublines
1076 if (max_subparts_per_part > 16)
1077 max_subparts_per_part = 16;
1078 if (n_subparts_per_part < min_subparts_per_part)
1079 n_subparts_per_part = min_subparts_per_part;
1080 if (n_subparts_per_part > max_subparts_per_part)
1081 n_subparts_per_part = max_subparts_per_part;
1082 return n_subparts_per_part;
1083}
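
// Worked example (added; hand-evaluated on the MI250X branch): num_teams =
// 1000 and num_parts = 250 give parallelismSurplus = 2; with block_size = 5
// and line_length = 64 (logLineLength = 6) the regression gives
//   modeled = -8.6214 + 7.3468*2 + 0.3596*5 + 0.6673*6 = 11.87 -> 12,
// which lies within the clamp [1, min((64 + 2) / 3, 16)] = [1, 16], so each
// line is split into 12 sublines.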
1084
1085template <typename ArgActiveExecutionMemorySpace>
1086struct SolveTridiagsDefaultModeAndAlgo;
1087
1091template <typename MatrixType>
1092BlockHelperDetails::PartInterface<MatrixType>
1093createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1094 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
1095 const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type>> &partitions,
1096 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
1097 IFPACK2_BLOCKHELPER_TIMER("createPartInterface", createPartInterface);
1098 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1099 using local_ordinal_type = typename impl_type::local_ordinal_type;
1100 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1101 using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
1102 using size_type = typename impl_type::size_type;
1103
1104 auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);
1105
1106 TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0);
1107 const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
1108 constexpr int vector_length = impl_type::vector_length;
1109 constexpr int internal_vector_length = impl_type::internal_vector_length;
1110
1111 const auto comm = A->getRowMap()->getComm();
1112
1113 BlockHelperDetails::PartInterface<MatrixType> interf;
1114
1115 const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
1116 const bool jacobi = partitions.size() == 0 || partitions.size() == A_n_lclrows;
1117 const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1118
1119 typedef std::pair<local_ordinal_type, local_ordinal_type> size_idx_pair_type;
1120 std::vector<size_idx_pair_type> partsz(nparts);
1121
1122 if (!jacobi) {
1123 for (local_ordinal_type i = 0; i < nparts; ++i)
1124 partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1125 std::sort(partsz.begin(), partsz.end(),
1126 [](const size_idx_pair_type &x, const size_idx_pair_type &y) {
1127 return x.first > y.first;
1128 });
1129 }
1130
1131 local_ordinal_type n_subparts_per_part;
1132 if (jacobi) {
1133 n_subparts_per_part = 1;
1134 } else {
1135 if (n_subparts_per_part_in == -1) {
1136 // If the number of subparts is set to -1, the user lets the algorithm
1137 // decide the value automatically
1138 using execution_space = typename impl_type::execution_space;
1139
1140 // Line splitting only benefits GPUs
1141 if constexpr (impl_type::node_type::is_gpu) {
1142 const int line_length = partsz[0].first;
1143
1144 const local_ordinal_type team_size =
1145 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1146 recommended_team_size(blocksize, vector_length, internal_vector_length);
1147
1148 const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
1149 n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
1150#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1151 printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
1152#endif
1153 } else {
1154 n_subparts_per_part = 1;
1155#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1156 printf("Automatically chosen n_subparts_per_part = 1 for CPU backend\n");
1157#endif
1158 }
1159 } else {
1160 n_subparts_per_part = n_subparts_per_part_in;
1161 }
1162 }
1163
1164 // Total number of sub lines:
1165 const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
1166 // Total number of sub lines + the Schur complement blocks.
1167 // For a given line, 2 sub lines imply one Schur complement, 3 sub lines imply two Schur complements, etc.
1168 const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part - 1);
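
 // For example (added): nparts = 4 lines each split into
 // n_subparts_per_part = 3 sublines give n_sub_parts = 12 and
 // n_sub_parts_and_schur = 12 + 4 * 2 = 20 (two Schur blocks per line).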
1169
1170#if defined(BLOCKTRIDICONTAINER_DEBUG)
1171 local_ordinal_type nrows = 0;
1172 if (jacobi)
1173 nrows = nparts;
1174 else
1175 for (local_ordinal_type i = 0; i < nparts; ++i) nrows += partitions[i].size();
1176
1177 TEUCHOS_TEST_FOR_EXCEPT_MSG(nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1178 << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
1179#endif
1180
1181 // permutation vector
1182 std::vector<local_ordinal_type> p;
1183 if (jacobi) {
1184 interf.max_partsz = 1;
1185 interf.max_subpartsz = 0;
1186 interf.n_subparts_per_part = 1;
1187 interf.nparts = nparts;
1188 } else {
1189 // reorder parts to maximize simd packing efficiency
1190 p.resize(nparts);
1191
1192 for (local_ordinal_type i = 0; i < nparts; ++i)
1193 p[i] = partsz[i].second;
1194
1195 interf.max_partsz = partsz[0].first;
1196
1197 constexpr local_ordinal_type connection_length = 2;
1198 const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1199 const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1200
1201 interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
1202 interf.n_subparts_per_part = n_subparts_per_part;
1203 interf.nparts = nparts;
1204 }
1205
1206 // allocate parts
1207 interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1208 interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1209 interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1210 interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1211 interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1212
1213 interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1214 interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1215 interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1216
1217 interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);
1218
1219 // mirror to host and compute on host execution space
1220 const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1221 const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1222
1223 const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1224 const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1225 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1226 const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1227
1228 const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1229 const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1230 const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1231
1232 // Determine parts.
1233 interf.row_contiguous = true;
1234 partptr(0) = 0;
1235 part2rowidx0(0) = 0;
1236 part2packrowidx0(0) = 0;
1237 local_ordinal_type pack_nrows = 0;
1238 local_ordinal_type pack_nrows_sub = 0;
1239 if (jacobi) {
1240 IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi);
1241 // Jacobi (all lines have length 1) means that A_n_lclrows == nparts,
1242 // so the mapping between parts and rows is trivial.
1243 // Note: we can leave interf.row_contiguous = true, since for all i: lclrow(i) == i
1244 for (local_ordinal_type i = 0; i <= nparts; ++i) {
1245 part2rowidx0(i) = i;
1246 partptr(i) = i;
1247 }
1248 for (local_ordinal_type i = 0; i < nparts; ++i) {
1249 rowidx2part(i) = i;
1250 lclrow(i) = i;
1251 }
1252 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1253 // assume no overlap.
1254 if (ip % vector_length == 0) pack_nrows = 1;
1255 part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1256 }
1257 part2rowidx0_sub(0) = 0;
1258 partptr_sub(0, 0) = 0;
1259
1260 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1261 constexpr local_ordinal_type ipnrows = 1;
1262 const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1263
1264 TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
1265 "In part " << ip << ", the line length " << full_line_length << " does not match the expected length " << ipnrows << ".");
1266
1267 constexpr local_ordinal_type connection_length = 2;
1268
1269 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1270 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
1271 "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1272
1273 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1274 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1275
1276 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1277
1278 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1279 const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1280 const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1281 if (local_sub_ip != n_subparts_per_part - 1) {
1282 if (local_sub_ip != 0) {
1283 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1284 } else if (ip != 0) {
1285 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1286 }
1287 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1288 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1289 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1290
1291 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1292 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1293
1294#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1295 printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1296 printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", schur_ip, partptr_sub(schur_ip, 0), connection_length);
1297#endif
1298 } else {
1299 if (local_sub_ip != 0) {
1300 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1301 } else if (ip != 0) {
1302 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1303 }
1304 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1305
1306 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1307
1308#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1309 printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1310#endif
1311 }
1312 }
1313 }
1314
1315#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1316 std::cout << "partptr_sub = " << std::endl;
1317 for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1318 for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1319 std::cout << partptr_sub(i, j) << " ";
1320 }
1321 std::cout << std::endl;
1322 }
1323 std::cout << "partptr_sub end" << std::endl;
1324#endif
1325
1326 {
1327 local_ordinal_type npacks = ceil(float(nparts) / vector_length);
1328
1329 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1330 for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1331 part2packrowidx0_sub(ip, 0) = 0;
1332 }
1333 for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1334 if (ipack != 0) {
1335 local_ordinal_type ip_min = ipack * vector_length;
1336 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1337 for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1338 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1339 }
1340 }
1341
1342 for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1343 local_ordinal_type ip_min = ipack * vector_length;
1344 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1345
1346 const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1347
1348 constexpr local_ordinal_type connection_length = 2;
1349
1350 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1351 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1352
1353 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1354 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1355 if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1356
1357 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1358
1359 for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1360 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1361 }
1362 }
1363 }
1364
1365 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1366 }
1367 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1368 } else {
1369 IFPACK2_BLOCKHELPER_TIMER("compute part indices", indices);
1370 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1371 const auto *part = &partitions[p[ip]];
1372 const local_ordinal_type ipnrows = part->size();
1373 TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip - 1]].size())));
1374 TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1375 BlockHelperDetails::get_msg_prefix(comm)
1376 << "partition " << p[ip]
1377 << " is empty, which is not allowed.");
1378 // Assume no overlap.
1379 part2rowidx0(ip + 1) = part2rowidx0(ip) + ipnrows;
1380 // Since parts are ordered in decreasing size, the size of the first
1381 // part in a pack is the size for all parts in the pack.
1382 if (ip % vector_length == 0) pack_nrows = ipnrows;
1383 part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1384 const local_ordinal_type offset = partptr(ip);
1385 for (local_ordinal_type i = 0; i < ipnrows; ++i) {
1386 const auto lcl_row = (*part)[i];
1387 TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1388 BlockHelperDetails::get_msg_prefix(comm)
1389 << "partitions[" << p[ip] << "]["
1390 << i << "] = " << lcl_row
1391 << " but input matrix implies limits of [0, " << A_n_lclrows - 1
1392 << "].");
1393 lclrow(offset + i) = lcl_row;
1394 rowidx2part(offset + i) = ip;
1395 if (interf.row_contiguous && offset + i > 0 && lclrow((offset + i) - 1) + 1 != lcl_row)
1396 interf.row_contiguous = false;
1397 }
1398 partptr(ip + 1) = offset + ipnrows;
1399
1400#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1401 printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1402 printf("partptr(%d+1) = %d\n", ip, partptr(ip + 1));
1403#endif
1404 }
1405
1406 part2rowidx0_sub(0) = 0;
1407 partptr_sub(0, 0) = 0;
1408 // const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length);
1409
1410 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1411 const auto *part = &partitions[p[ip]];
1412 const local_ordinal_type ipnrows = part->size();
1413 const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1414
1415 TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
1416 "In part " << ip << ": the line length " << full_line_length << " does not match the expected number of rows " << ipnrows << ".");
1417
1418 constexpr local_ordinal_type connection_length = 2;
1419
1420 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1421 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
1422 "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1423
1424 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1425 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1426
1427 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1428
1429 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1430 const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1431 const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1432 if (local_sub_ip != n_subparts_per_part - 1) {
1433 if (local_sub_ip != 0) {
1434 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1435 } else if (ip != 0) {
1436 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1437 }
1438 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1439 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1440 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1441
1442 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1443 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1444
1445#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1446 printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1447 printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", schur_ip, partptr_sub(schur_ip, 0), connection_length);
1448#endif
1449 } else {
1450 if (local_sub_ip != 0) {
1451 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1452 } else if (ip != 0) {
1453 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1454 }
1455 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1456
1457 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1458
1459#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1460 printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1461#endif
1462 }
1463 }
1464 }
1465
1466 {
1467 local_ordinal_type npacks = ceil(float(nparts) / vector_length);
1468
1469 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1470 for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1471 part2packrowidx0_sub(ip, 0) = 0;
1472 }
1473 for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1474 if (ipack != 0) {
1475 local_ordinal_type ip_min = ipack * vector_length;
1476 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1477 for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1478 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1479 }
1480 }
1481
1482 for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1483 local_ordinal_type ip_min = ipack * vector_length;
1484 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1485
1486 const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1487
1488 constexpr local_ordinal_type connection_length = 2;
1489
1490 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1491 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1492
1493 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1494 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1495 if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1496
1497 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1498
1499 for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1500 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1501 }
1502 }
1503 }
1504
1505 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1506 }
1507 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1508 }
1509#if defined(BLOCKTRIDICONTAINER_DEBUG)
1510 TEUCHOS_ASSERT(partptr(nparts) == nrows);
1511#endif
1512 if (lclrow(0) != 0) interf.row_contiguous = false;
1513
1514 Kokkos::deep_copy(interf.partptr, partptr);
1515 Kokkos::deep_copy(interf.lclrow, lclrow);
1516
1517 Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1518
1519 // Assume no overlap. Thus:
1520 interf.part2rowidx0 = interf.partptr;
1521 Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1522
1523 interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1524 Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1525
1526 { // Fill packptr.
1527 IFPACK2_BLOCKHELPER_TIMER("Fill packptr", packptr0);
1528 // Count the packs by scanning for boundaries in part2packrowidx0.
1529 local_ordinal_type npacks = 0;
1530 for (local_ordinal_type ip = 1; ip <= nparts; ++ip) // n_sub_parts_and_schur
1531 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1532 ++npacks;
1533
1534 interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1535 const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1536 packptr(0) = 0;
1537 for (local_ordinal_type ip = 1, k = 1; ip <= nparts; ++ip)
1538 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1539 packptr(k++) = ip;
1540
1541 Kokkos::deep_copy(interf.packptr, packptr);
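    // Illustrative example (editor's note, not from the original source): with
    // nparts = 10 and vector_length = 4, part2packrowidx0 changes value at
    // ip = 4, 8, and 10, so npacks = 3 and packptr = {0, 4, 8, 10}.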
1542
1543 local_ordinal_type npacks_per_subpart = ceil(float(nparts) / vector_length);
1544 npacks = npacks_per_subpart * (part2packrowidx0_sub.extent(1) - 1);
1545
1546 interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart * n_subparts_per_part);
1547 interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart, n_subparts_per_part - 1);
1548
1549 const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1550 const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1551
1552 // Fill packindices_sub and packindices_schur
1553 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part - 1; ++local_sub_ip) {
1554 for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1555 packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1556 packindices_schur(local_pack_ip, local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1557 }
1558 }
1559
1560 for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1561 packindices_sub((n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip;
1562 }
1563
1564#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1565 std::cout << "packindices_sub = " << std::endl;
1566 for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1567 std::cout << packindices_sub(i) << " ";
1568 }
1569 std::cout << std::endl;
1570 std::cout << "packindices_sub end" << std::endl;
1571
1572 std::cout << "packindices_schur = " << std::endl;
1573 for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1574 for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1575 std::cout << packindices_schur(i, j) << " ";
1576 }
1577 std::cout << std::endl;
1578 }
1579
1580 std::cout << "packindices_schur end" << std::endl;
1581#endif
1582
1583 Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1584 Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1585
1586 interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr_sub"), npacks + 1);
1587 const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1588 packptr_sub(0) = 0;
1589 for (local_ordinal_type k = 0; k < npacks + 1; ++k)
1590 packptr_sub(k) = packptr(k % npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1591
1592 Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
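    // Illustrative example (editor's note), continuing packptr = {0, 4, 8, 10}
    // with npacks_per_subpart = 3: the formula tiles packptr by shifts of
    // packptr(3) = 10, giving packptr_sub = {0, 4, 8, 10, 14, 18, 20, ...}
    // across the alternating sub-part/Schur pack sequence.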
1593 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1594 }
1595 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1596
1597 return interf;
1598}
1599
1603template <typename MatrixType>
1604struct BlockTridiags {
1605 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1606 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1607 using size_type_1d_view = typename impl_type::size_type_1d_view;
1608 using size_type_2d_view = typename impl_type::size_type_2d_view;
1609 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1610 using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1611 using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1612 using internal_vector_type_3d_view = typename impl_type::internal_vector_type_3d_view;
1613
1614 // flat_td_ptr(i) is the index into flat-array values of the start of the
1615 // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1616 // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1617 // vector_length is the position in the pack.
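 // Illustrative example (editor's note, not from the original source): with
 // vector_length == 4, tridiag i = 5 lives in pack 5 / 4 = 1 and occupies
 // SIMD lane 5 % 4 = 1 of that pack's vector-valued blocks.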
1618 size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1619 // List of local column indices into A from which to grab
1620 // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1621 local_ordinal_type_1d_view A_colindsub;
1622 // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1623 // tridiag's pack, and i % vector_length gives the position in the pack.
1624 vector_type_3d_view values;
1625 // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th
1626 // Schur's pack, and i % vector_length gives the position in the pack.
1627 vector_type_3d_view values_schur;
1628 // inv(A_00)*A_01 block values.
1629 vector_type_4d_view e_values;
1630 // If doing Schur line splitting: space for permuted version of X,
1631 // to be used during the Schur complement block solves (SolveTridiags, SingleVectorSchurTag).
1632 // Otherwise, this is not allocated.
1633 internal_vector_type_3d_view X_internal_vector_values_schur;
1634
1635 // The following are for fused block Jacobi only.
1636 // For block row i, diag_offset(i)...diag_offset(i + bs^2)
1637 // is the range of scalars for the diagonal block.
1638 size_type_1d_view diag_offsets;
1639 // For fused residual+solve block Jacobi case,
1640 // this contains the diagonal block inverses in flat, local row indexing:
1641 // d_inv(row, :, :) gives the row-major block for row.
1642 btdm_scalar_type_3d_view d_inv;
1643
1644 bool is_diagonal_only;
1645
1646 BlockTridiags() = default;
1647 BlockTridiags(const BlockTridiags &b) = default;
1648
1649 // Index into row-major block of a tridiag.
1650 template <typename idx_type>
1651 static KOKKOS_FORCEINLINE_FUNCTION
1652 idx_type
1653 IndexToRow(const idx_type &ind) { return (ind + 1) / 3; }
1654 // Given a row of a row-major tridiag, return the index of the first block
1655 // in that row.
1656 template <typename idx_type>
1657 static KOKKOS_FORCEINLINE_FUNCTION
1658 idx_type
1659 RowToIndex(const idx_type &row) { return row > 0 ? 3 * row - 1 : 0; }
1660 // Number of blocks in a tridiag having a given number of rows.
1661 template <typename idx_type>
1662 static KOKKOS_FORCEINLINE_FUNCTION
1663 idx_type
1664 NumBlocks(const idx_type &nrows) { return nrows > 0 ? 3 * nrows - 2 : 0; }
1665 // Number of blocks associated to a Schur complement having a given number of rows.
1666 template <typename idx_type>
1667 static KOKKOS_FORCEINLINE_FUNCTION
1668 idx_type
1669 NumBlocksSchur(const idx_type &nrows) { return nrows > 0 ? 3 * nrows + 2 : 0; }
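 // Worked example (editor's note, illustrative): a tridiag with nrows = 3
 // stores its blocks row-major as [d0 u0 | l0 d1 u1 | l1 d2], so
 // NumBlocks(3) = 7, RowToIndex(2) = 3*2 - 1 = 5 points at l1 (the first
 // block of row 2), and IndexToRow(5) = (5 + 1) / 3 = 2 recovers the row.
 // NumBlocksSchur(2) = 3*2 + 2 = 8 additionally counts the blocks coupling a
 // two-row Schur connection to its neighboring sub-lines.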
1670};
1671
1675template <typename MatrixType>
1676BlockTridiags<MatrixType>
1677createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
1678 IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags", createBlockTridiags0);
1679 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1680 using execution_space = typename impl_type::execution_space;
1681 using local_ordinal_type = typename impl_type::local_ordinal_type;
1682 using size_type = typename impl_type::size_type;
1683 using size_type_2d_view = typename impl_type::size_type_2d_view;
1684
1685 constexpr int vector_length = impl_type::vector_length;
1686
1687 BlockTridiags<MatrixType> btdm;
1688
1689 const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1690
1691 { // construct the flat index pointers into the tridiag values array.
1692 btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1693 const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part);
1694 Kokkos::parallel_scan(
1695 "createBlockTridiags::RangePolicy::flat_td_ptr",
1696 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1697 const local_ordinal_type partidx = i / (2 * interf.n_subparts_per_part);
1698 const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1699
1700 if (final) {
1701 btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1702 }
1703 if (local_subpartidx != (2 * interf.n_subparts_per_part - 1)) {
1704 const local_ordinal_type nrows = interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 1) - interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 0);
1705 if (local_subpartidx % 2 == 0)
1706 update += btdm.NumBlocks(nrows);
1707 else
1708 update += btdm.NumBlocksSchur(nrows);
1709 }
1710 });
1711
1712 const auto nblocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts - 1, 2 * interf.n_subparts_per_part - 1));
1713 btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
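    // NumBlocks(1) == 1, so the accumulated block count equals the number of
    // tridiags exactly when every tridiag consists of a single diagonal block.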
1714 }
1715
1716 // And the packed index pointers.
1717 if (vector_length == 1) {
1718 btdm.pack_td_ptr = btdm.flat_td_ptr;
1719 } else {
1720 // const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1;
1721
1722 local_ordinal_type npacks_per_subpart = 0;
1723 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1724 Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1725 for (local_ordinal_type ip = 1; ip <= interf.nparts; ++ip) // n_sub_parts_and_schur
1726 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1727 ++npacks_per_subpart;
1728
1729 btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1730 const Kokkos::RangePolicy<execution_space> policy(0, npacks_per_subpart);
1731
1732 Kokkos::parallel_for(
1733 "createBlockTridiags::RangePolicy::pack_td_ptr",
1734 policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1735 for (local_ordinal_type j = 0; j < 2 * interf.n_subparts_per_part; ++j) {
1736 const local_ordinal_type pack_id = (j == 2 * interf.n_subparts_per_part - 1) ? i + (j - 1) * npacks_per_subpart : i + j * npacks_per_subpart;
1737 const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id + 1) - interf.packptr_sub(pack_id);
1738
1739 const local_ordinal_type parti = interf.packptr_sub(pack_id);
1740 const local_ordinal_type partidx = parti % interf.nparts;
1741
1742 for (local_ordinal_type pti = 0; pti < nparts_in_pack; ++pti) {
1743 btdm.pack_td_ptr(partidx + pti, j) = btdm.flat_td_ptr(i, j);
1744 }
1745 }
1746 });
1747 }
1748
1749 btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1750
1751 const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1752 constexpr local_ordinal_type connection_length = 2;
1753
1754 host_pack_td_ptr_schur(0, 0) = 0;
1755 for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1756 if (i % vector_length == 0) {
1757 if (i != 0)
1758 host_pack_td_ptr_schur(i, 0) = host_pack_td_ptr_schur(i - 1, host_pack_td_ptr_schur.extent(1) - 1);
1759 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part - 1; ++j) {
1760 host_pack_td_ptr_schur(i, j + 1) = host_pack_td_ptr_schur(i, j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part - 2 ? 1 : 0);
1761 }
1762 } else {
1763 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1764 host_pack_td_ptr_schur(i, j) = host_pack_td_ptr_schur(i - 1, j);
1765 }
1766 }
1767 }
1768
1769 Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1770
1771#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1772 const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1773 std::cout << "flat_td_ptr = " << std::endl;
1774 for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1775 for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1776 std::cout << host_flat_td_ptr(i, j) << " ";
1777 }
1778 std::cout << std::endl;
1779 }
1780 std::cout << "flat_td_ptr end" << std::endl;
1781
1782 const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1783
1784 std::cout << "pack_td_ptr = " << std::endl;
1785 for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1786 for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1787 std::cout << host_pack_td_ptr(i, j) << " ";
1788 }
1789 std::cout << std::endl;
1790 }
1791 std::cout << "pack_td_ptr end" << std::endl;
1792
1793 std::cout << "pack_td_ptr_schur = " << std::endl;
1794 for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1795 for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1796 std::cout << host_pack_td_ptr_schur(i, j) << " ";
1797 }
1798 std::cout << std::endl;
1799 }
1800 std::cout << "pack_td_ptr_schur end" << std::endl;
1801#endif
1802
1803 // values and A_colindsub are created in the symbolic phase
1804 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1805
1806 return btdm;
1807}
1808
1809// Set the tridiags to identity (I) out to the full pack block size. That way,
1810// if a tridiag within a pack is shorter than the longest one, the extra blocks
1811// are processed in a safe way. Similarly, in the solve phase, if the extra
1812// blocks in the packed multivector are 0, and the tridiag LU reflects the
1813// extra I blocks, then the solve proceeds as though the extra blocks aren't
1814// present. Since this extra work is part of the SIMD calls, it's not actually
1815// extra work. Instead, it means we don't have to insert checks or masks, or
1816// risk quiet NaNs. This functor has to be called just once, in the symbolic
1817// phase, since the numeric phase fills in only the used entries, leaving these
1818// I blocks intact.
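// Illustrative example (editor's note, assuming vector_length = 4): if a pack
// holds parts of 5, 5, 4, and 4 rows, the two shorter tridiags receive
// identity blocks in their fifth row; with the corresponding packed RHS
// entries equal to 0, the LU solve of those rows reproduces 0, so all four
// SIMD lanes can be processed without masking.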
1819template <typename MatrixType>
1820void setTridiagsToIdentity(const BlockTridiags<MatrixType> &btdm,
1821 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view &packptr) {
1822 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1823 using execution_space = typename impl_type::execution_space;
1824 using local_ordinal_type = typename impl_type::local_ordinal_type;
1825 using size_type_2d_view = typename impl_type::size_type_2d_view;
1826
1827 const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1828 const local_ordinal_type blocksize = btdm.values.extent(1);
1829
1830 {
1831 const int vector_length = impl_type::vector_length;
1832 const int internal_vector_length = impl_type::internal_vector_length;
1833
1834 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1835 using internal_vector_type = typename impl_type::internal_vector_type;
1836 using internal_vector_type_4d_view =
1837 typename impl_type::internal_vector_type_4d_view;
1838
1839 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1840 const internal_vector_type_4d_view values(reinterpret_cast<internal_vector_type *>(btdm.values.data()),
1841 btdm.values.extent(0),
1842 btdm.values.extent(1),
1843 btdm.values.extent(2),
1844 vector_length / internal_vector_length);
1845 const local_ordinal_type vector_loop_size = values.extent(3);
1846#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1847 local_ordinal_type total_team_size(0);
1848 if (blocksize <= 5)
1849 total_team_size = 32;
1850 else if (blocksize <= 9)
1851 total_team_size = 64;
1852 else if (blocksize <= 12)
1853 total_team_size = 96;
1854 else if (blocksize <= 16)
1855 total_team_size = 128;
1856 else if (blocksize <= 20)
1857 total_team_size = 160;
1858 else
1859 total_team_size = 160;
1860 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1861 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1862#elif defined(KOKKOS_ENABLE_HIP)
1863 // FIXME: HIP
1864 // These settings might be completely wrong
1865 // will have to do some experiments to decide
1866 // what makes sense on AMD GPUs
1867 local_ordinal_type total_team_size(0);
1868 if (blocksize <= 5)
1869 total_team_size = 32;
1870 else if (blocksize <= 9)
1871 total_team_size = 64;
1872 else if (blocksize <= 12)
1873 total_team_size = 96;
1874 else if (blocksize <= 16)
1875 total_team_size = 128;
1876 else if (blocksize <= 20)
1877 total_team_size = 160;
1878 else
1879 total_team_size = 160;
1880 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1881 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1882#elif defined(KOKKOS_ENABLE_SYCL)
1883 // SYCL: FIXME
1884 local_ordinal_type total_team_size(0);
1885 if (blocksize <= 5)
1886 total_team_size = 32;
1887 else if (blocksize <= 9)
1888 total_team_size = 64;
1889 else if (blocksize <= 12)
1890 total_team_size = 96;
1891 else if (blocksize <= 16)
1892 total_team_size = 128;
1893 else if (blocksize <= 20)
1894 total_team_size = 160;
1895 else
1896 total_team_size = 160;
1897 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1898 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1899#else
1900 // Host architecture: team size is always one
1901 const team_policy_type policy(packptr.extent(0) - 1, 1, 1);
1902#endif
1903 Kokkos::parallel_for(
1904 "setTridiagsToIdentity::TeamPolicy",
1905 policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1906 const local_ordinal_type k = member.league_rank();
1907 const local_ordinal_type ibeg = pack_td_ptr(packptr(k), 0);
1908 const local_ordinal_type iend = pack_td_ptr(packptr(k), pack_td_ptr.extent(1) - 1);
1909
1910 const local_ordinal_type diff = iend - ibeg;
1911 const local_ordinal_type icount = diff / 3 + (diff % 3 > 0);
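          // Diagonal blocks are written at stride 3 (each block row past the
          // first contributes lower/diagonal/upper blocks), so icount counts
          // the indices ibeg + 3*ii that fall in [ibeg, iend).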
1912 const btdm_scalar_type one(1);
1913 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
1914 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, icount), [&](const local_ordinal_type &ii) {
1915 const local_ordinal_type i = ibeg + ii * 3;
1916 for (local_ordinal_type j = 0; j < blocksize; ++j) {
1917 values(i, j, j, v) = one;
1918 }
1919 });
1920 });
1921 });
1922 }
1923}
1924
1928template <typename MatrixType>
1929void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1930 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1931 const BlockHelperDetails::PartInterface<MatrixType> &interf,
1932 BlockTridiags<MatrixType> &btdm,
1933 BlockHelperDetails::AmD<MatrixType> &amd,
1934 const bool overlap_communication_and_computation,
1935 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
1936 bool useSeqMethod,
1937 bool use_fused_jacobi) {
1938 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase);
1939
1940 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1941
1942 using execution_space = typename impl_type::execution_space;
1943 using host_execution_space = typename impl_type::host_execution_space;
1944
1945 using local_ordinal_type = typename impl_type::local_ordinal_type;
1946 using global_ordinal_type = typename impl_type::global_ordinal_type;
1947 using size_type = typename impl_type::size_type;
1948 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1949 using size_type_1d_view = typename impl_type::size_type_1d_view;
1950 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1951 using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1952 using internal_vector_type_3d_view = typename impl_type::internal_vector_type_3d_view;
1953 using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
1954 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1955 using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1956
1957 constexpr int vector_length = impl_type::vector_length;
1958 constexpr int internal_vector_length = impl_type::internal_vector_length;
1959
1960 const auto comm = A->getRowMap()->getComm();
1961
1962 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
1963 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
1964
1965 bool hasBlockCrsMatrix = !A_bcrs.is_null();
1966 TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
1967 const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows() / g->getLocalNumRows();
1968
1969 // mirroring to host
1970 const auto partptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.partptr);
1971 const auto lclrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.lclrow);
1972 const auto rowidx2part = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.rowidx2part);
1973 const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1974 const auto packptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.packptr);
1975
1976 const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1977
1978 Kokkos::View<local_ordinal_type *, host_execution_space> col2row("col2row", A->getLocalNumCols());
1979
1980 // find column to row map on host
1981
1982 Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1983 {
1984 const auto rowmap = g->getRowMap();
1985 const auto colmap = g->getColMap();
1986 const auto dommap = g->getDomainMap();
1987 TEUCHOS_ASSERT(!(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1988 rowmap->lazyPushToHost();
1989 colmap->lazyPushToHost();
1990 dommap->lazyPushToHost();
1991
1992#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1993 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
1994 Kokkos::parallel_for(
1995 "performSymbolicPhase::RangePolicy::col2row",
1996 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1997 const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1998 TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
1999 if (dommap->isNodeGlobalElement(gid)) {
2000 const local_ordinal_type lc = colmap->getLocalElement(gid);
2001#if defined(BLOCKTRIDICONTAINER_DEBUG)
2002 TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
2003 BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
2004 << " gives an invalid local column.");
2005#endif
2006 col2row(lc) = lr;
2007 }
2008 });
2009#endif
2010 }
2011
2012 // construct the D and R graphs in A = D + R.
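  // Here D collects the block-tridiagonal entries that lie inside each part,
  // and R collects everything else (off-part and off-tridiag entries); when
  // communication and computation overlap, R is further split into owned and
  // remote column sets.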
2013 {
2014 const auto local_graph = g->getLocalGraphHost();
2015 const auto local_graph_rowptr = local_graph.row_map;
2016 TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
2017 const auto local_graph_colidx = local_graph.entries;
2018
2019 // assume no overlap.
2020
2021 Kokkos::View<local_ordinal_type *, host_execution_space> lclrow2idx("lclrow2idx", nrows);
2022 {
2023 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2024 Kokkos::parallel_for(
2025 "performSymbolicPhase::RangePolicy::lclrow2idx",
2026 policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
2027 lclrow2idx[lclrow(i)] = i;
2028 });
2029 }
2030
2031 // count (block) nnzs in D and R.
2032 typedef BlockHelperDetails::SumReducer<size_type, 3, host_execution_space> sum_reducer_type;
2033 typename sum_reducer_type::value_type sum_reducer_value;
2034 {
2035 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2036 Kokkos::parallel_reduce
2037 // profiling interface does not work
2038 ( //"performSymbolicPhase::RangePolicy::count_nnz",
2039 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
2040 // LID -> index.
2041 const local_ordinal_type ri0 = lclrow2idx[lr];
2042 const local_ordinal_type pi0 = rowidx2part(ri0);
2043 for (size_type j = local_graph_rowptr(lr); j < local_graph_rowptr(lr + 1); ++j) {
2044 const local_ordinal_type lc = local_graph_colidx(j);
2045 const local_ordinal_type lc2r = col2row[lc];
2046 bool incr_R = false;
2047 do { // breakable
2048 if (lc2r == (local_ordinal_type)-1) {
2049 incr_R = true;
2050 break;
2051 }
2052 const local_ordinal_type ri = lclrow2idx[lc2r];
2053 const local_ordinal_type pi = rowidx2part(ri);
2054 if (pi != pi0) {
2055 incr_R = true;
2056 break;
2057 }
2058 // Test for being in the tridiag. This is done in index space. In
2059 // LID space, tridiag LIDs in a row are not necessarily related by
2060 // {-1, 0, 1}.
2061 if (ri0 + 1 >= ri && ri0 <= ri + 1)
2062 ++update.v[0]; // D_nnz
2063 else
2064 incr_R = true;
2065 } while (0);
2066 if (incr_R) {
2067 if (lc < nrows)
2068 ++update.v[1]; // R_nnz_owned
2069 else
2070 ++update.v[2]; // R_nnz_remote
2071 }
2072 }
2073 },
2074 sum_reducer_type(sum_reducer_value));
2075 }
2076 size_type D_nnz = sum_reducer_value.v[0];
2077 size_type R_nnz_owned = sum_reducer_value.v[1];
2078 size_type R_nnz_remote = sum_reducer_value.v[2];
2079
2080 if (!overlap_communication_and_computation) {
2081 R_nnz_owned += R_nnz_remote;
2082 R_nnz_remote = 0;
2083 }
2084
2085 // construct the D_00 graph.
2086 {
2087 const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
2088
2089 btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
2090 const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
2091
2092#if defined(BLOCKTRIDICONTAINER_DEBUG)
2093 Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2094#endif
2095
2096 const local_ordinal_type nparts = partptr.extent(0) - 1;
2097
2098 {
2099 const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
2100 Kokkos::parallel_for(
2101 "performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
2102 policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
2103 const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2104 local_ordinal_type offset = 0;
2105 for (local_ordinal_type ri0 = partptr(pi0); ri0 < partptr(pi0 + 1); ++ri0) {
2106 const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2107 offset = 1;
2108 const local_ordinal_type lr0 = lclrow(ri0);
2109 const size_type j0 = local_graph_rowptr(lr0);
2110 for (size_type j = j0; j < local_graph_rowptr(lr0 + 1); ++j) {
2111 const local_ordinal_type lc = local_graph_colidx(j);
2112 const local_ordinal_type lc2r = col2row[lc];
2113 if (lc2r == (local_ordinal_type)-1) continue;
2114 const local_ordinal_type ri = lclrow2idx[lc2r];
2115 const local_ordinal_type pi = rowidx2part(ri);
2116 if (pi != pi0) continue;
2117 if (ri + 1 < ri0 || ri > ri0 + 1) continue;
2118 const local_ordinal_type row_entry = j - j0;
2119 D_A_colindsub(flat_td_ptr(pi0, 0) + ((td_row_os + ri) - ri0)) = row_entry;
2120 }
2121 }
2122 });
2123 }
2124#if defined(BLOCKTRIDICONTAINER_DEBUG)
2125 for (size_t i = 0; i < D_A_colindsub.extent(0); ++i)
2126 TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2127#endif
2128 Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2129
2130 // Allocate values.
2131 {
2132 const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0) - 1, btdm.pack_td_ptr.extent(1) - 1);
2133 const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2134 btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
2135
2136 if (interf.n_subparts_per_part > 1) {
2137 const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0) - 1, btdm.pack_td_ptr_schur.extent(1) - 1);
2138 const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2139 btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2140 }
2141
2142 if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2143 }
2144 }
2145
2146 // Construct the R graph.
2147 {
2148 amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
2149 amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
2150
2151 const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2152 const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2153
2154 amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2155 amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
2156
2157 const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2158 const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2159
2160 {
2161 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2162 Kokkos::parallel_for(
2163 "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2164 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2165 const local_ordinal_type ri0 = lclrow2idx[lr];
2166 const local_ordinal_type pi0 = rowidx2part(ri0);
2167 const size_type j0 = local_graph_rowptr(lr);
2168 for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2169 const local_ordinal_type lc = local_graph_colidx(j);
2170 const local_ordinal_type lc2r = col2row[lc];
2171 if (lc2r != (local_ordinal_type)-1) {
2172 const local_ordinal_type ri = lclrow2idx[lc2r];
2173 const local_ordinal_type pi = rowidx2part(ri);
2174 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2175 continue;
2176 }
2177 }
2178 // exclusive scan will be performed later
2179 if (!overlap_communication_and_computation || lc < nrows) {
2180 ++R_rowptr(lr);
2181 } else {
2182 ++R_rowptr_remote(lr);
2183 }
2184 }
2185 });
2186 }
2187
2188 // exclusive scan
2189 typedef BlockHelperDetails::ArrayValueType<size_type, 2> update_type;
2190 {
2191 Kokkos::RangePolicy<host_execution_space> policy(0, nrows + 1);
2192 Kokkos::parallel_scan(
2193 "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2194 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, update_type &update, const bool &final) {
2195 update_type val;
2196 val.v[0] = R_rowptr(lr);
2197 if (overlap_communication_and_computation)
2198 val.v[1] = R_rowptr_remote(lr);
2199
2200 if (final) {
2201 R_rowptr(lr) = update.v[0];
2202 if (overlap_communication_and_computation)
2203 R_rowptr_remote(lr) = update.v[1];
2204
2205 if (lr < nrows) {
2206 const local_ordinal_type ri0 = lclrow2idx[lr];
2207 const local_ordinal_type pi0 = rowidx2part(ri0);
2208
2209 size_type cnt_rowptr = R_rowptr(lr);
2210 size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
2211
2212 const size_type j0 = local_graph_rowptr(lr);
2213 for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2214 const local_ordinal_type lc = local_graph_colidx(j);
2215 const local_ordinal_type lc2r = col2row[lc];
2216 if (lc2r != (local_ordinal_type)-1) {
2217 const local_ordinal_type ri = lclrow2idx[lc2r];
2218 const local_ordinal_type pi = rowidx2part(ri);
2219 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2220 continue;
2221 }
2222 const local_ordinal_type row_entry = j - j0;
2223 if (!overlap_communication_and_computation || lc < nrows)
2224 R_A_colindsub(cnt_rowptr++) = row_entry;
2225 else
2226 R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2227 }
2228 }
2229 }
2230 update += val;
2231 });
2232 }
2233 TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
2234 Kokkos::deep_copy(amd.rowptr, R_rowptr);
2235 Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2236 if (overlap_communication_and_computation) {
2237 TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
2238 Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2239 Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2240 }
2241
2242 // Allocate or view values.
2243 if (hasBlockCrsMatrix)
2244 amd.tpetra_values = (const_cast<block_crs_matrix_type *>(A_bcrs.get())->getValuesDeviceNonConst());
2245 else {
2246 amd.tpetra_values = (const_cast<crs_matrix_type *>(A_crs.get()))->getLocalValuesDevice(Tpetra::Access::ReadWrite);
2247 }
2248 }
2249
2250 if (interf.n_subparts_per_part > 1) {
2251 // If doing Schur complement line splitting, allocate E and space for permuted X
2252 btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2253 btdm.X_internal_vector_values_schur = internal_vector_type_3d_view(
2254 do_not_initialize_tag("X_internal_vector_values_schur"),
2255 2 * (interf.n_subparts_per_part - 1) * interf.part2packrowidx0_sub.extent(0),
2256 blocksize,
2257 vector_length / internal_vector_length);
2258 }
2259 }
2260 // Precompute offsets of each A and x entry to speed up residual.
2261 // Applies if all of these are true:
2262 // - hasBlockCrsMatrix
2263 // - execution_space is a GPU
2264 // - !useSeqMethod (since this uses a different scheme for indexing A,x)
2265 //
2266 // Reading A, x take up to 4 and 6 levels of indirection respectively,
2267 // but precomputing the offsets reduces it to 2 for both (get index, then value)
2268 if (BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix) {
2269 bool is_async_importer_active = !async_importer.is_null();
2270 local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2271 bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2272 BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2273 }
2274
2275 // If using fused block Jacobi path, allocate diagonal inverses here (d_inv) and find diagonal offsets.
2276 if (use_fused_jacobi) {
2277 btdm.d_inv = btdm_scalar_type_3d_view(do_not_initialize_tag("btdm.d_inv"), interf.nparts, blocksize, blocksize);
2278 auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice();
2279 auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice();
2280 btdm.diag_offsets = BlockHelperDetails::findDiagOffsets<execution_space, size_type_1d_view>(rowptrs, entries, interf.nparts, blocksize);
2281 }
2282 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2283}
2284
2288template <typename ArgActiveExecutionMemorySpace>
2289struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;
2290
2291template <>
2292struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2293 typedef KB::Mode::Serial mode_type;
2294#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2295 typedef KB::Algo::Level3::CompactMKL algo_type;
2296#else
2297 typedef KB::Algo::Level3::Blocked algo_type;
2298#endif
2299 static int recommended_team_size(const int /* blksize */,
2300 const int /* vector_length */,
2301 const int /* internal_vector_length */) {
2302 return 1;
2303 }
2304};
2305
2306#if defined(KOKKOS_ENABLE_CUDA)
2307static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
2308 const int vector_length,
2309 const int internal_vector_length) {
2310 const int vector_size = vector_length / internal_vector_length;
2311 int total_team_size(0);
2312 if (blksize <= 5)
2313 total_team_size = 32;
2314 else if (blksize <= 9)
2315 total_team_size = 32; // 64
2316 else if (blksize <= 12)
2317 total_team_size = 96;
2318 else if (blksize <= 16)
2319 total_team_size = 128;
2320 else if (blksize <= 20)
2321 total_team_size = 160;
2322 else
2323 total_team_size = 160;
2324 return 2 * total_team_size / vector_size;
2325}
2326template <>
2327struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2328 typedef KB::Mode::Team mode_type;
2329 typedef KB::Algo::Level3::Unblocked algo_type;
2330 static int recommended_team_size(const int blksize,
2331 const int vector_length,
2332 const int internal_vector_length) {
2333 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2334 }
2335};
2336template <>
2337struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2338 typedef KB::Mode::Team mode_type;
2339 typedef KB::Algo::Level3::Unblocked algo_type;
2340 static int recommended_team_size(const int blksize,
2341 const int vector_length,
2342 const int internal_vector_length) {
2343 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2344 }
2345};
2346#endif
2347
2348#if defined(KOKKOS_ENABLE_HIP)
2349static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
2350 const int vector_length,
2351 const int internal_vector_length) {
2352 const int vector_size = vector_length / internal_vector_length;
2353 int total_team_size(0);
2354 if (blksize <= 5)
2355 total_team_size = 32;
2356 else if (blksize <= 9)
2357 total_team_size = 32; // 64
2358 else if (blksize <= 12)
2359 total_team_size = 96;
2360 else if (blksize <= 16)
2361 total_team_size = 128;
2362 else if (blksize <= 20)
2363 total_team_size = 160;
2364 else
2365 total_team_size = 160;
2366 return 2 * total_team_size / vector_size;
2367}
2368template <>
2369struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2370 typedef KB::Mode::Team mode_type;
2371 typedef KB::Algo::Level3::Unblocked algo_type;
2372 static int recommended_team_size(const int blksize,
2373 const int vector_length,
2374 const int internal_vector_length) {
2375 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2376 }
2377};
2378template <>
2379struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2380 typedef KB::Mode::Team mode_type;
2381 typedef KB::Algo::Level3::Unblocked algo_type;
2382 static int recommended_team_size(const int blksize,
2383 const int vector_length,
2384 const int internal_vector_length) {
2385 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2386 }
2387};
2388#endif
2389
2390#if defined(KOKKOS_ENABLE_SYCL)
2391static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
2392 const int vector_length,
2393 const int internal_vector_length) {
2394 const int vector_size = vector_length / internal_vector_length;
2395 int total_team_size(0);
2396 if (blksize <= 5)
2397 total_team_size = 32;
2398 else if (blksize <= 9)
2399 total_team_size = 32; // 64
2400 else if (blksize <= 12)
2401 total_team_size = 96;
2402 else if (blksize <= 16)
2403 total_team_size = 128;
2404 else if (blksize <= 20)
2405 total_team_size = 160;
2406 else
2407 total_team_size = 160;
2408 return 2 * total_team_size / vector_size;
2409}
2410template <>
2411struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2412 typedef KB::Mode::Team mode_type;
2413 typedef KB::Algo::Level3::Unblocked algo_type;
2414 static int recommended_team_size(const int blksize,
2415 const int vector_length,
2416 const int internal_vector_length) {
2417 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2418 }
2419};
2420template <>
2421struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2422 typedef KB::Mode::Team mode_type;
2423 typedef KB::Algo::Level3::Unblocked algo_type;
2424 static int recommended_team_size(const int blksize,
2425 const int vector_length,
2426 const int internal_vector_length) {
2427 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2428 }
2429};
2430#endif
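// Illustrative usage sketch (editor's note; names such as nrowpacks and
// vector_loop_size are placeholders, not identifiers from this file): the
// factorization kernels are expected to consume these traits roughly as
//
//   using dmaa_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<
//       typename execution_space::memory_space>;
//   const int team_size = dmaa_type::recommended_team_size(
//       blocksize, vector_length, internal_vector_length);
//   const Kokkos::TeamPolicy<execution_space> policy(nrowpacks, team_size, vector_loop_size);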
2431
2432template <typename impl_type, typename WWViewType>
2433KOKKOS_INLINE_FUNCTION void
2434solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2435 const typename impl_type::local_ordinal_type & /* blocksize */,
2436 const typename impl_type::local_ordinal_type &i0,
2437 const typename impl_type::local_ordinal_type &r0,
2438 const typename impl_type::local_ordinal_type &nrows,
2439 const typename impl_type::local_ordinal_type &v,
2440 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2441 const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2442 const WWViewType &WW,
2443 const bool skip_first_pass = false) {
2444 using execution_space = typename impl_type::execution_space;
2445 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2446 using member_type = typename team_policy_type::member_type;
2447 using local_ordinal_type = typename impl_type::local_ordinal_type;
2448
2449 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2450
2451 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2452 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2453
2454 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2455
2456 // constant
2457 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
2458 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
2459
2460 // subview pattern
2461 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2462 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2463 auto X2 = X1;
2464
2465 local_ordinal_type i = i0, r = r0;
2466
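  // What follows is block forward/backward substitution over this tridiag's
  // packed LU factors. Within a tridiag starting at block index i0, the
  // diagonal factor of block row k sits at i0 + 3*k, with its sub-/super-
  // diagonal neighbors at i0 + 3*k - 1 and i0 + 3*k + 1 (cf.
  // BlockTridiags::RowToIndex).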
2467 if (nrows > 1) {
2468 // solve Lx = x
2469 if (skip_first_pass) {
2470 i += (nrows - 2) * 3;
2471 r += (nrows - 2);
2472 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2473 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2474 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2475 KB::Trsm<member_type,
2476 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2477 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2478 X1.assign_data(X2.data());
2479 i += 3;
2480 } else {
2481 KB::Trsm<member_type,
2482 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2483 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2484 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
2485 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2486 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2487 member.team_barrier();
2488 KB::Gemm<member_type,
2489 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2490 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2491 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2492 KB::Trsm<member_type,
2493 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2494 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2495 X1.assign_data(X2.data());
2496 }
2497 }
2498
2499 // solve Ux = x
2500 KB::Trsm<member_type,
2501 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2502 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2503 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2504 i -= 3;
2505 A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
2506 X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
2507 member.team_barrier();
2508 KB::Gemm<member_type,
2509 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2510 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2511
2512 A.assign_data(&D_internal_vector_values(i, 0, 0, v));
2513 KB::Trsm<member_type,
2514 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2515 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2516 X1.assign_data(X2.data());
2517 }
2518 } else {
2519 // matrix is already inverted
2520 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2521 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
2522 member.team_barrier();
2523 KB::Gemm<member_type,
2524 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2525 default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
2526 }
2527}
2528
2529template <typename impl_type, typename WWViewType, typename XViewType>
2530KOKKOS_INLINE_FUNCTION void
2531solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2532 const typename impl_type::local_ordinal_type &blocksize,
2533 const typename impl_type::local_ordinal_type &i0,
2534 const typename impl_type::local_ordinal_type &r0,
2535 const typename impl_type::local_ordinal_type &nrows,
2536 const typename impl_type::local_ordinal_type &v,
2537 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2538 const XViewType &X_internal_vector_values, // Unmanaged<typename impl_type::internal_vector_type_4d_view>
2539 const WWViewType &WW) {
2540 using execution_space = typename impl_type::execution_space;
2541 // using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2542 // using member_type = typename team_policy_type::member_type;
2543 using local_ordinal_type = typename impl_type::local_ordinal_type;
2544
2545 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2546
2547 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2548 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2549
2550 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2551
2552 // base pointers
2553 auto A = D_internal_vector_values.data();
2554 auto X = X_internal_vector_values.data();
2555
2556 // constant
2557 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
2558 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
2559 // const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2560
2561 // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2562 const local_ordinal_type astep = D_internal_vector_values.stride(0);
2563 const local_ordinal_type as0 = D_internal_vector_values.stride(1); // blocksize*vector_length;
2564 const local_ordinal_type as1 = D_internal_vector_values.stride(2); // vector_length;
2565 const local_ordinal_type xstep = X_internal_vector_values.stride(0);
2566 const local_ordinal_type xs0 = X_internal_vector_values.stride(1); // vector_length;
2567
2568 // move to starting point
2569 A += i0 * astep + v;
2570 X += r0 * xstep + v;
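  // From here the sweeps run in raw-pointer form: each block row advances A by
  // 3 * astep (past its lower, diagonal, and upper blocks) and X by one xstep,
  // mirroring the subview-based walk in solveMultiVector.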
2571
2572 // for (local_ordinal_type col=0;col<num_vectors;++col)
2573 if (nrows > 1) {
2574 // solve Lx = x
2575 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2576 member,
2577 KB::Diag::Unit,
2578 blocksize, blocksize,
2579 one,
2580 A, as0, as1,
2581 X, xs0);
2582
2583 for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
2584 member.team_barrier();
2585 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2586 member,
2587 blocksize, blocksize,
2588 -one,
2589 A + 2 * astep, as0, as1,
2590 X, xs0,
2591 one,
2592 X + 1 * xstep, xs0);
2593 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2594 member,
2595 KB::Diag::Unit,
2596 blocksize, blocksize,
2597 one,
2598 A + 3 * astep, as0, as1,
2599 X + 1 * xstep, xs0);
2600
2601 A += 3 * astep;
2602 X += 1 * xstep;
2603 }
2604
2605 // solve Ux = x
2606 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2607 member,
2608 KB::Diag::NonUnit,
2609 blocksize, blocksize,
2610 one,
2611 A, as0, as1,
2612 X, xs0);
2613
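 // Backward sweep: step A back one block-row triple per iteration;
 // A + 1 * astep is then the super-diagonal block whose product with the
 // already-solved X of the row below is subtracted before the upper solve.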
2614 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2615 A -= 3 * astep;
2616 member.team_barrier();
2617 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2618 member,
2619 blocksize, blocksize,
2620 -one,
2621 A + 1 * astep, as0, as1,
2622 X, xs0,
2623 one,
2624 X - 1 * xstep, xs0);
2625 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2626 member,
2627 KB::Diag::NonUnit,
2628 blocksize, blocksize,
2629 one,
2630 A, as0, as1,
2631 X - 1 * xstep, xs0);
2632 X -= 1 * xstep;
2633 }
2634 // for multiple rhs
2635 // X += xs1;
2636 } else {
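 // Single-row subline: the diagonal block was explicitly inverted during
 // factorization, so apply it directly: copy X into scratch W, then
 // X := A^{-1} * W via one GEMV.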
2637 const local_ordinal_type ws0 = WW.stride(0);
2638 auto W = WW.data() + v;
2639 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
2640 member, blocksize, X, xs0, W, ws0);
2641 member.team_barrier();
2642 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2643 member,
2644 blocksize, blocksize,
2645 one,
2646 A, as0, as1,
2647 W, ws0,
2648 zero,
2649 X, xs0);
2650 }
2651}
2652
2653template <typename local_ordinal_type, typename ViewType>
2654void writeBTDValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2655#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2656 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2657 std::ofstream myfile;
2658 myfile.open(fileName);
2659
2660 const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2661 local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2662 const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2663 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2664
2665 const local_ordinal_type block_size = scalar_values.extent(1);
2666
2667 const local_ordinal_type n_rows_per_part = (n_blocks_per_part + 2) / 3 * block_size;
2668 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2669
2670 const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2671
2672 myfile << "%%MatrixMarket matrix coordinate real general" << std::endl;
2673 myfile << "%%nnz = " << nnz;
2674 myfile << " block size = " << block_size;
2675 myfile << " number of blocks = " << n_blocks;
2676 myfile << " number of parts = " << n_parts;
2677 myfile << " number of blocks per part = " << n_blocks_per_part;
2678 myfile << " number of rows = " << n_rows;
2679 myfile << " number of cols = " << n_rows;
2680 myfile << " number of packs = " << n_packs << std::endl;
2681
2682 myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;
2683
2684 local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2685 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2686 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2687 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2688 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2689 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2690 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2691 continue;
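 // Blocks appear in repeating (diagonal, super-diagonal, sub-diagonal)
 // triples; map each to its (row, col) block offset in the assembled matrix.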
2692 if (i_block_in_part % 3 == 0) {
2693 current_row_offset = i_block_in_part / 3 * block_size;
2694 current_col_offset = i_block_in_part / 3 * block_size;
2695 } else if (i_block_in_part % 3 == 1) {
2696 current_row_offset = (i_block_in_part - 1) / 3 * block_size;
2697 current_col_offset = ((i_block_in_part - 1) / 3 + 1) * block_size;
2698 } else if (i_block_in_part % 3 == 2) {
2699 current_row_offset = ((i_block_in_part - 2) / 3 + 1) * block_size;
2700 current_col_offset = (i_block_in_part - 2) / 3 * block_size;
2701 }
2702 current_row_offset += current_part_idx * n_rows_per_part;
2703 current_col_offset += current_part_idx * n_rows_per_part;
2704 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2705 for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2706 current_row = current_row_offset + i_in_block + 1;
2707 current_col = current_col_offset + j_in_block + 1;
2708 myfile << current_row << " " << current_col << " " << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2709 }
2710 }
2711 }
2712 }
2713 }
2714
2715 myfile.close();
2716#endif
2717}
2718
2719template <typename local_ordinal_type, typename ViewType>
2720void write4DMultiVectorValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2721#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2722 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2723 std::ofstream myfile;
2724 myfile.open(fileName);
2725
2726 const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2727 const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2728 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2729
2730 const local_ordinal_type block_size = scalar_values.extent(1);
2731 const local_ordinal_type n_cols = scalar_values.extent(2);
2732
2733 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2734 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2735
2736 const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2737
2738 myfile << "%%MatrixMarket matrix array real general" << std::endl;
2739 myfile << "%%block size = " << block_size;
2740 myfile << " number of blocks = " << n_blocks;
2741 myfile << " number of parts = " << n_parts;
2742 myfile << " number of blocks per part = " << n_blocks_per_part;
2743 myfile << " number of rows = " << n_rows;
2744 myfile << " number of cols = " << n_cols;
2745 myfile << " number of packs = " << n_packs << std::endl;
2746
2747 myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2748
2749 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2750 (void)current_row_offset;
2751 (void)current_part_idx;
2752 for (local_ordinal_type j_in_block = 0; j_in_block < n_cols; ++j_in_block) {
2753 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2754 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2755 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2756 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2757 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2758
2759 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2760 continue;
2761 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2762 myfile << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2763 }
2764 }
2765 }
2766 }
2767 }
2768 myfile.close();
2769#endif
2770}
2771
2772template <typename local_ordinal_type, typename ViewType>
2773void write5DMultiVectorValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2774#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2775 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2776 std::ofstream myfile;
2777 myfile.open(fileName);
2778
2779 const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2780 const local_ordinal_type n_blocks = scalar_values.extent(1) * n_parts_per_pack;
2781 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2782
2783 const local_ordinal_type block_size = scalar_values.extent(2);
2784 const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2785 const local_ordinal_type n_cols = n_blocks_cols * block_size;
2786
2787 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2788 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2789
2790 const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2791
2792 myfile << "%%MatrixMarket matrix array real general" << std::endl;
2793 myfile << "%%block size = " << block_size;
2794 myfile << " number of blocks = " << n_blocks;
2795 myfile << " number of parts = " << n_parts;
2796 myfile << " number of blocks per part = " << n_blocks_per_part;
2797 myfile << " number of rows = " << n_rows;
2798 myfile << " number of cols = " << n_cols;
2799 myfile << " number of packs = " << n_packs << std::endl;
2800
2801 myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2802
2803 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2804 (void)current_row_offset;
2805 (void)current_part_idx;
2806 for (local_ordinal_type i_block_col = 0; i_block_col < n_blocks_cols; ++i_block_col) {
2807 for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2808 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2809 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2810 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2811 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2812 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2813
2814 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(1))
2815 continue;
2816 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2817 myfile << scalar_values(i_block_col, current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2818 }
2819 }
2820 }
2821 }
2822 }
2823 }
2824 myfile.close();
2825#endif
2826}
2827
2828template <typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
2829KOKKOS_INLINE_FUNCTION void
2830copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
2831 /*
2832 // Kokkos::Experimental::local_deep_copy
2833 auto teamVectorRange =
2834 Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, member_type>(
2835 member, view1.extent(0), view1.extent(1), view1.extent(2));
2836
2837 Kokkos::parallel_for
2838 (teamVectorRange,
2839 [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) {
2840 view1(i,j,k) = view2(i,j,k);
2841 });
2842 */
2843 Kokkos::Experimental::local_deep_copy(member, view1, view2);
2844}
2845template <typename MatrixType, int ScratchLevel>
2846struct ExtractAndFactorizeTridiags {
2847 public:
2848 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2849 // a functor cannot have both device_type and execution_space; having both causes a specialization error in Kokkos
2850 using execution_space = typename impl_type::execution_space;
2851 using memory_space = typename impl_type::memory_space;
2853 using local_ordinal_type = typename impl_type::local_ordinal_type;
2854 using size_type = typename impl_type::size_type;
2855 using impl_scalar_type = typename impl_type::impl_scalar_type;
2856 using magnitude_type = typename impl_type::magnitude_type;
2858 using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
2859 using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
2861 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2862 using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
2863 using size_type_1d_view = typename impl_type::size_type_1d_view;
2864 using size_type_2d_view = typename impl_type::size_type_2d_view;
2865 using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
2867 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2868 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2869 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2870 using vector_type_4d_view = typename impl_type::vector_type_4d_view;
2871 using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2872 using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
2873 using btdm_scalar_type_2d_view = typename impl_type::btdm_scalar_type_2d_view;
2874 using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
2875 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2876 using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
2877 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2878 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2879 using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
2880 using local_crs_graph_type = typename impl_type::local_crs_graph_type;
2881 using colinds_view = typename local_crs_graph_type::entries_type;
2882
2883 using internal_vector_type = typename impl_type::internal_vector_type;
2884 static constexpr int vector_length = impl_type::vector_length;
2885 static constexpr int internal_vector_length = impl_type::internal_vector_length;
2886 static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
2887 static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2888 // half_vector_length is used for block Jacobi factorization.
2889 // Shared memory requirement is twice as large (per vector lane) as for general tridi factorization, so
2890 // reducing vector length (if possible) keeps the shared requirement constant. This avoids the performance
2891 // cliff of switching from level 0 to level 1 scratch.
2892 static constexpr int half_vector_length = impl_type::half_vector_length;
2893
2895 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2896 using member_type = typename team_policy_type::member_type;
2897
2898 private:
2899 // part interface
2900 const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2901 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2902 const local_ordinal_type max_partsz;
2903 // block crs matrix (its row-map size_type could be Kokkos::UVMSpace::size_type, which is int)
2904 using size_type_1d_view_tpetra = Kokkos::View<size_t *, typename impl_type::node_device_type>;
2905 ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2906 ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2907 ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2908 // block tridiags
2909 const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2910 const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2911 const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2912 const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2913 const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2914 const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2915 const Unmanaged<btdm_scalar_type_3d_view> d_inv;
2916 const Unmanaged<size_type_1d_view> diag_offsets;
2917 // shared information
2918 const local_ordinal_type blocksize, blocksize_square;
2919 // diagonal safety
2920 const magnitude_type tiny;
2921 const local_ordinal_type vector_loop_size;
2922
2923 bool hasBlockCrsMatrix;
2924
2925 public:
2926 ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2927 const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2928 const Teuchos::RCP<const row_matrix_type> &A_,
2929 const Teuchos::RCP<const crs_graph_type> &G_,
2930 const magnitude_type &tiny_)
2931 : // interface
2932 partptr(interf_.partptr)
2933 , lclrow(interf_.lclrow)
2934 , packptr(interf_.packptr)
2935 , packindices_sub(interf_.packindices_sub)
2936 , packptr_sub(interf_.packptr_sub)
2937 , partptr_sub(interf_.partptr_sub)
2938 , part2packrowidx0_sub(interf_.part2packrowidx0_sub)
2939 , packindices_schur(interf_.packindices_schur)
2940 , max_partsz(interf_.max_partsz)
2941 ,
2942 // block tridiags
2943 pack_td_ptr(btdm_.pack_td_ptr)
2944 , flat_td_ptr(btdm_.flat_td_ptr)
2945 , pack_td_ptr_schur(btdm_.pack_td_ptr_schur)
2946 , A_colindsub(btdm_.A_colindsub)
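 // btdm value storage is SIMD-packed; alias it two ways below: as
 // internal_vector views (vector_length / internal_vector_length lanes,
 // for team-vector kernels) and as plain scalar views (for element-wise
 // extraction).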
2947 , internal_vector_values((internal_vector_type *)btdm_.values.data(),
2948 btdm_.values.extent(0),
2949 btdm_.values.extent(1),
2950 btdm_.values.extent(2),
2951 vector_length / internal_vector_length)
2952 , internal_vector_values_schur((internal_vector_type *)btdm_.values_schur.data(),
2953 btdm_.values_schur.extent(0),
2954 btdm_.values_schur.extent(1),
2955 btdm_.values_schur.extent(2),
2956 vector_length / internal_vector_length)
2957 , e_internal_vector_values((internal_vector_type *)btdm_.e_values.data(),
2958 btdm_.e_values.extent(0),
2959 btdm_.e_values.extent(1),
2960 btdm_.e_values.extent(2),
2961 btdm_.e_values.extent(3),
2962 vector_length / internal_vector_length)
2963 , scalar_values((btdm_scalar_type *)btdm_.values.data(),
2964 btdm_.values.extent(0),
2965 btdm_.values.extent(1),
2966 btdm_.values.extent(2),
2967 vector_length)
2968 , scalar_values_schur((btdm_scalar_type *)btdm_.values_schur.data(),
2969 btdm_.values_schur.extent(0),
2970 btdm_.values_schur.extent(1),
2971 btdm_.values_schur.extent(2),
2972 vector_length)
2973 , e_scalar_values((btdm_scalar_type *)btdm_.e_values.data(),
2974 btdm_.e_values.extent(0),
2975 btdm_.e_values.extent(1),
2976 btdm_.e_values.extent(2),
2977 btdm_.e_values.extent(3),
2978 vector_length)
2979 , d_inv(btdm_.d_inv)
2980 , diag_offsets(btdm_.diag_offsets)
2981 , blocksize(btdm_.values.extent(1))
2982 , blocksize_square(blocksize * blocksize)
2983 ,
2984 // diagonal weight to avoid zero pivots
2985 tiny(tiny_)
2986 , vector_loop_size(vector_length / internal_vector_length) {
2987 using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
2988 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
2989
2990 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
2991 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);
2992
2993 hasBlockCrsMatrix = !A_bcrs.is_null();
2994
2995 A_block_rowptr = G_->getLocalGraphDevice().row_map;
2996 if (hasBlockCrsMatrix) {
2997 A_values = const_cast<block_crs_matrix_type *>(A_bcrs.get())->getValuesDeviceNonConst();
2998 } else {
2999 A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
3000 A_values = A_crs->getLocalValuesDevice(Tpetra::Access::ReadOnly);
3001 }
3002 }
3003
3004 private:
3005 KOKKOS_INLINE_FUNCTION
3006 void
3007 extract(local_ordinal_type partidx,
3008 local_ordinal_type local_subpartidx,
3009 local_ordinal_type npacks) const {
3010#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3011 printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
3012#endif
3013 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3014 const size_type kps = pack_td_ptr(partidx, local_subpartidx);
3015 local_ordinal_type kfs[vector_length] = {};
3016 local_ordinal_type ri0[vector_length] = {};
3017 local_ordinal_type nrows[vector_length] = {};
3018
3019 for (local_ordinal_type vi = 0; vi < npacks; ++vi, ++partidx) {
3020 kfs[vi] = flat_td_ptr(partidx, local_subpartidx);
3021 ri0[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 0);
3022 nrows[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 1) - ri0[vi];
3023#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3024 printf("kfs[%d] = %d;\n", vi, kfs[vi]);
3025 printf("ri0[%d] = %d;\n", vi, ri0[vi]);
3026 printf("nrows[%d] = %d;\n", vi, nrows[vi]);
3027#endif
3028 }
3029 local_ordinal_type tr_min = 0;
3030 local_ordinal_type tr_max = nrows[0];
3031 if (local_subpartidx % 2 == 1) {
3032 tr_min -= 1;
3033 tr_max += 1;
3034 }
3035#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3036 printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3037#endif
3038 for (local_ordinal_type tr = tr_min, j = 0; tr < tr_max; ++tr) {
3039 for (local_ordinal_type e = 0; e < 3; ++e) {
3040 if (hasBlockCrsMatrix) {
3041 const impl_scalar_type *block[vector_length] = {};
3042 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3043 const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
3044
3045 block[vi] = &A_values(Aj * blocksize_square);
3046 }
3047 const size_type pi = kps + j;
3048#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3049 printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
3050#endif
3051 ++j;
3052 for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3053 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3054 const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
3055 auto &v = internal_vector_values(pi, ii, jj, 0);
3056 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3057 v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
3058 }
3059 }
3060 }
3061 } else {
3062 const size_type pi = kps + j;
3063
3064 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3065 const size_type Aj_c = A_colindsub(kfs[vi] + j);
3066
3067 for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3068 auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr) * blocksize + ii);
3069
3070 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3071 scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c * blocksize + jj);
3072 }
3073 }
3074 }
3075 ++j;
3076 }
3077 if (nrows[0] == 1) break;
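 // Parts in a pack are ordered by decreasing length, so once a shorter part
 // runs out of block rows, shrink npacks to stop filling its vector lanes.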
3078 if (local_subpartidx % 2 == 0) {
3079 if (e == 1 && (tr == 0 || tr + 1 == nrows[0])) break;
3080 for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3081 if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr + 1 == nrows[vi])) {
3082 npacks = vi;
3083 break;
3084 }
3085 }
3086 } else {
3087 if (e == 0 && (tr == -1 || tr == nrows[0])) break;
3088 for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3089 if (e == 0 && (nrows[vi] == 1 || tr == nrows[vi])) {
3090 npacks = vi;
3091 break;
3092 }
3093 }
3094 }
3095 }
3096 }
3097 }
3098
3099 KOKKOS_INLINE_FUNCTION
3100 void
3101 extract(const member_type &member,
3102 const local_ordinal_type &partidxbeg,
3103 local_ordinal_type local_subpartidx,
3104 const local_ordinal_type &npacks,
3105 const local_ordinal_type &vbeg) const {
3106#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3107 printf("extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3108#endif
3109 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3110 local_ordinal_type kfs_vals[internal_vector_length] = {};
3111 local_ordinal_type ri0_vals[internal_vector_length] = {};
3112 local_ordinal_type nrows_vals[internal_vector_length] = {};
3113
3114 const size_type kps = pack_td_ptr(partidxbeg, local_subpartidx);
3115 for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3116 kfs_vals[vi] = flat_td_ptr(partidxbeg + vi, local_subpartidx);
3117 ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 0);
3118 nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 1) - ri0_vals[vi];
3119#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3120 printf("kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3121 printf("ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3122 printf("nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3123#endif
3124 }
3125
3126 local_ordinal_type j_vals[internal_vector_length] = {};
3127
3128 local_ordinal_type tr_min = 0;
3129 local_ordinal_type tr_max = nrows_vals[0];
3130 if (local_subpartidx % 2 == 1) {
3131 tr_min -= 1;
3132 tr_max += 1;
3133 }
3134#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3135 printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3136#endif
3137 for (local_ordinal_type tr = tr_min; tr < tr_max; ++tr) {
3138 for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3139 const local_ordinal_type nrows = nrows_vals[vi];
3140 if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows + 1)) {
3141 auto &j = j_vals[vi];
3142 const local_ordinal_type kfs = kfs_vals[vi];
3143 const local_ordinal_type ri0 = ri0_vals[vi];
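 // Boundary rows carry fewer of the up-to-3 blocks per row; lbeg/lend
 // select how many this block row contributes.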
3144 local_ordinal_type lbeg, lend;
3145 if (local_subpartidx % 2 == 0) {
3146 lbeg = (tr == tr_min ? 1 : 0);
3147 lend = (tr == nrows - 1 ? 2 : 3);
3148 } else {
3149 lbeg = 0;
3150 lend = 3;
3151 if (tr == tr_min) {
3152 lbeg = 1;
3153 lend = 2;
3154 } else if (tr == nrows) {
3155 lbeg = 0;
3156 lend = 1;
3157 }
3158 }
3159 if (hasBlockCrsMatrix) {
3160 for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3161 const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3162 const impl_scalar_type *block = &A_values(Aj * blocksize_square);
3163 const size_type pi = kps + j;
3164#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3165 printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3166#endif
3167 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3168 [&](const local_ordinal_type &ii) {
3169 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3170 scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii, jj, blocksize)]);
3171 }
3172 });
3173 }
3174 } else {
3175 for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3176 const size_type Aj_c = A_colindsub(kfs + j);
3177 const size_type pi = kps + j;
3178 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3179 [&](const local_ordinal_type &ii) {
3180 auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr) * blocksize + ii);
3181 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3182 scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c * blocksize + jj);
3183 }
3184 });
3185 }
3186 }
3187 }
3188 }
3189 }
3190 }
3191
3192 template <typename AAViewType,
3193 typename WWViewType>
3194 KOKKOS_INLINE_FUNCTION void
3195 factorize_subline(const member_type &member,
3196 const local_ordinal_type &i0,
3197 const local_ordinal_type &nrows,
3198 const local_ordinal_type &v,
3199 const AAViewType &AA,
3200 const WWViewType &WW) const {
3201 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3202
3203 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3204 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3205
3206 // constant
3207 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3208
3209#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3210 printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3211#endif
3212
3213 // subview pattern
3214 auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3215 KB::LU<member_type,
3216 default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3217
3218 if (nrows > 1) {
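 // Block LU recurrence down the subline: given the LU of the current
 // diagonal A, triangular-solve the super-diagonal B (L^{-1} B) and the
 // sub-diagonal C (C U^{-1}), then apply the Schur update A_next -= C * B
 // and factorize A_next.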
3219 auto B = A;
3220 auto C = A;
3221 local_ordinal_type i = i0;
3222 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
3223#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3224 printf("tr = %d, i = %d;\n", tr, i);
3225#endif
3226 B.assign_data(&AA(i + 1, 0, 0, v));
3227 KB::Trsm<member_type,
3228 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3229 default_mode_type, default_algo_type>::invoke(member, one, A, B);
3230 C.assign_data(&AA(i + 2, 0, 0, v));
3231 KB::Trsm<member_type,
3232 KB::Side::Right, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3233 default_mode_type, default_algo_type>::invoke(member, one, A, C);
3234 A.assign_data(&AA(i + 3, 0, 0, v));
3235
3236 member.team_barrier();
3237 KB::Gemm<member_type,
3238 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3239 default_mode_type, default_algo_type>::invoke(member, -one, C, B, one, A);
3240 KB::LU<member_type,
3241 default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3242 }
3243 } else {
3244 // for block jacobi invert a matrix here
3245 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3246 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, A, W);
3247 KB::SetIdentity<member_type, default_mode_type>::invoke(member, A);
3248 member.team_barrier();
3249 KB::Trsm<member_type,
3250 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3251 default_mode_type, default_algo_type>::invoke(member, one, W, A);
3252 KB::Trsm<member_type,
3253 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3254 default_mode_type, default_algo_type>::invoke(member, one, W, A);
3255 }
3256 }
3257
3258 public:
3259 struct ExtractAndFactorizeSubLineTag {};
3260 struct ExtractAndFactorizeFusedJacobiTag {};
3261 struct ExtractBCDTag {};
3262 struct ComputeETag {};
3263 struct ComputeSchurTag {};
3264 struct FactorizeSchurTag {};
3265
3266 KOKKOS_INLINE_FUNCTION
3267 void
3268 operator()(const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
3269 // btdm is packed and sorted by decreasing part size
3270 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3271
3272 const local_ordinal_type subpartidx = packptr_sub(packidx);
3273 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3274 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3275 const local_ordinal_type partidx = subpartidx % n_parts;
3276
3277 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3278 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3279 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3280
3281 internal_vector_scratch_type_3d_view
3282 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3283
3284#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3285 printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3286 printf("vector_loop_size = %d\n", vector_loop_size);
3287#endif
3288
3289 if (vector_loop_size == 1) {
3290 extract(partidx, local_subpartidx, npacks);
3291 factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3292 } else {
3293 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3294 [&](const local_ordinal_type &v) {
3295 const local_ordinal_type vbeg = v * internal_vector_length;
3296#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3297 printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3298#endif
3299 if (vbeg < npacks)
3300 extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3301 // this is not safe if the vector loop size differs from the vector size of
3302 // the team policy; we always ensure this when constructing the team policy
3303 member.team_barrier();
3304 factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3305 });
3306 }
3307 }
3308
3309 KOKKOS_INLINE_FUNCTION
3310 void
3311 operator()(const ExtractAndFactorizeFusedJacobiTag &, const member_type &member) const {
3312 using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>;
3313 using default_mode_type = typename default_mode_and_algo_type::mode_type;
3314 using default_algo_type = typename default_mode_and_algo_type::algo_type;
3315 // When fused block Jacobi can be used, the mapping between local rows and parts is trivial (i <-> i)
3316 // We can simply pull the diagonal entry from A into d_inv
3317 btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3318 btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3319 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3320 const local_ordinal_type nrows = lclrow.extent(0);
3321 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, half_vector_length),
3322 [&](const local_ordinal_type &v) {
3323 local_ordinal_type row = member.league_rank() * half_vector_length + v;
3324 // diag_offsets(row) gives the offset of this row's diagonal block within A_values
3325 auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL());
3326 auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL());
3327 if (row < nrows) {
3328 // View the diagonal block of A in row as 2D row-major
3329 const impl_scalar_type *A_diag = A_values.data() + diag_offsets(row);
3330 // Copy the diag into scratch slice W1
3331 // (copying elements directly is better than KokkosBatched copy)
3332 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3333 [&](int i) {
3334 W1.data()[i] = A_diag[i];
3335 });
3336 // and set W2 to identity in preparation to invert with 2 x Trsm
3337 KB::SetIdentity<member_type, default_mode_type>::invoke(member, W2);
3338 } else {
3339 // if this vector lane has no block to invert, then set W1 to identity
3340 // so that LU still has a matrix to work on. LU uses team barriers so
3341 // having some lanes run it and some not will deadlock.
3342 KB::SetIdentity<member_type, default_mode_type>::invoke(member, W1);
3343 }
3344 member.team_barrier();
3345 // LU factorize in-place
3346 KB::LU<member_type, default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, W1, tiny);
3347 member.team_barrier();
3348 KB::Trsm<member_type,
3349 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3350 default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3351 KB::Trsm<member_type,
3352 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3353 default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3354 member.team_barrier();
3355 if (row < nrows) {
3356 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3357 [&](int i) {
3358 auto d_inv_block = &d_inv(row, 0, 0);
3359 d_inv_block[i] = W2.data()[i];
3360 });
3361 }
3362 });
3363 }
3364
3365 KOKKOS_INLINE_FUNCTION
3366 void
3367 operator()(const ExtractBCDTag &, const member_type &member) const {
3368 // btdm is packed and sorted by decreasing part size
3369 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3370 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3371 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3372
3373 const local_ordinal_type subpartidx = packptr_sub(packidx);
3374 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3375 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3376 const local_ordinal_type partidx = subpartidx % n_parts;
3377
3378 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3379 // const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3380 // const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3381
3382 if (vector_loop_size == 1) {
3383 extract(partidx, local_subpartidx, npacks);
3384 } else {
3385 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3386 [&](const local_ordinal_type &v) {
3387 const local_ordinal_type vbeg = v * internal_vector_length;
3388#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3389 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3390 printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3391#endif
3392 if (vbeg < npacks)
3393 extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3394 });
3395 }
3396
3397 member.team_barrier();
3398
3399 const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3400 const size_type kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 1;
3401
3402 const local_ordinal_type r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3403 const local_ordinal_type r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3404
3405#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3406 printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3407#endif
3408
3409 // Need to copy D to e_internal_vector_values.
3410 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3411 Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3412
3413 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3414 Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3415 }
3416
3417 KOKKOS_INLINE_FUNCTION
3418 void
3419 operator()(const ComputeETag &, const member_type &member) const {
3420 // btdm is packed and sorted by decreasing part size
3421 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3422
3423 const local_ordinal_type subpartidx = packptr_sub(packidx);
3424 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3425 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3426 const local_ordinal_type partidx = subpartidx % n_parts;
3427
3428 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3429 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3430 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
3431 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3432 const local_ordinal_type num_vectors = blocksize;
3433
3434 (void)npacks;
3435
3436 internal_vector_scratch_type_3d_view
3437 WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3438 if (local_subpartidx == 0) {
3439 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3440 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3441 });
3442 } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
3443 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3444 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3445 });
3446 } else {
3447 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3448 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3449 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3450 });
3451 }
3452 }
3453
3454 KOKKOS_INLINE_FUNCTION
3455 void
3456 operator()(const ComputeSchurTag &, const member_type &member) const {
3457 // btdm is packed and sorted by decreasing part size
3458 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3459 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3460 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3461
3462 const local_ordinal_type subpartidx = packptr_sub(packidx);
3463 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3464 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3465 const local_ordinal_type partidx = subpartidx % n_parts;
3466
3467 // const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3468 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3469 // const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3470 // const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3471
3472 // Compute S = D - C E
3473
3474 const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
3475 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
3476 const local_ordinal_type i0_offset = i0 + 2;
3477
3478 for (local_ordinal_type i = 0; i < 4; ++i) { // 4 == pack_td_ptr_schur(partidx, local_subpartidx_schur + 1) - i0_schur
3479 copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3480 Kokkos::subview(internal_vector_values, i0_offset + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3481 }
3482
3483 member.team_barrier();
3484
3485 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3486
3487 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx) + 1;
3488 const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 2;
3489
3490 const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3491 const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3492
3493 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3494
3495 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3496 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3497
3498 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3499 for (size_type i = 0; i < pack_td_ptr_schur(partidx, local_subpartidx_schur + 1) - pack_td_ptr_schur(partidx, local_subpartidx_schur); ++i) {
3500 local_ordinal_type e_r, e_c, c_kps;
3501
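 // Map the i-th Schur block to the E block (row e_r, column e_c) and the
 // C block (offset c_kps) contributing to it; blocks with no C*E
 // contribution are skipped (continue) and keep their copied D values.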
3502 if (local_subpartidx_schur == 0) {
3503 if (i == 0) {
3504 e_r = e_r1;
3505 e_c = 0;
3506 c_kps = c_kps1;
3507 } else if (i == 3) {
3508 e_r = e_r2;
3509 e_c = 1;
3510 c_kps = c_kps2;
3511 } else if (i == 4) {
3512 e_r = e_r2;
3513 e_c = 0;
3514 c_kps = c_kps2;
3515 } else {
3516 continue;
3517 }
3518 } else {
3519 if (i == 0) {
3520 e_r = e_r1;
3521 e_c = 1;
3522 c_kps = c_kps1;
3523 } else if (i == 1) {
3524 e_r = e_r1;
3525 e_c = 0;
3526 c_kps = c_kps1;
3527 } else if (i == 4) {
3528 e_r = e_r2;
3529 e_c = 1;
3530 c_kps = c_kps2;
3531 } else if (i == 5) {
3532 e_r = e_r2;
3533 e_c = 0;
3534 c_kps = c_kps2;
3535 } else {
3536 continue;
3537 }
3538 }
3539
3540 auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx, local_subpartidx_schur) + i, Kokkos::ALL(), Kokkos::ALL(), v);
3541 auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3542 auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3543 KB::Gemm<member_type,
3544 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3545 default_mode_type, default_algo_type>::invoke(member, -one, C, E, one, S);
3546 }
3547 });
3548 }
3549
3550 KOKKOS_INLINE_FUNCTION
3551 void
3552 operator()(const FactorizeSchurTag &, const member_type &member) const {
3553 const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3554
3555 const local_ordinal_type subpartidx = packptr_sub(packidx);
3556
3557 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3558 const local_ordinal_type partidx = subpartidx % n_parts;
3559
3560 const local_ordinal_type i0 = pack_td_ptr_schur(partidx, 0);
3561 const local_ordinal_type nrows = 2 * (pack_td_ptr_schur.extent(1) - 1);
3562
3563 internal_vector_scratch_type_3d_view
3564 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3565
3566#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3567 printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3568#endif
3569
3570 if (vector_loop_size == 1) {
3571 factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3572 } else {
3573 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3574 [&](const local_ordinal_type &v) {
3575 factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3576 });
3577 }
3578 }
3579
3580 void run() {
3581 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3582 const local_ordinal_type team_size =
3583 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3584 recommended_team_size(blocksize, vector_length, internal_vector_length);
3585 const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3586 shmem_size(blocksize, blocksize, vector_loop_size);
3587
3588 {
3589#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3590 printf("Start ExtractAndFactorizeSubLineTag\n");
3591#endif
3592 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3593 Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeSubLineTag>
3594 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3595
3596 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3597 writeBTDValuesToFile(n_parts, scalar_values, "before.mm");
3598
3599 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3600 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3601 policy, *this);
3602 execution_space().fence();
3603
3604 writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
3605#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3606 printf("End ExtractAndFactorizeSubLineTag\n");
3607#endif
3608 }
3609
3610 if (packindices_schur.extent(1) > 0) {
3611 {
3612#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3613 printf("Start ExtractBCDTag\n");
3614#endif
3615 Kokkos::deep_copy(e_scalar_values, KokkosKernels::ArithTraits<btdm_magnitude_type>::zero());
3616 Kokkos::deep_copy(scalar_values_schur, KokkosKernels::ArithTraits<btdm_magnitude_type>::zero());
3617
3618 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");
3619
3620 {
3621 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3622 Kokkos::TeamPolicy<execution_space, ExtractBCDTag>
3623 policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3624
3625 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3626 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3627 policy, *this);
3628 execution_space().fence();
3629 }
3630
3631#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3632 printf("End ExtractBCDTag\n");
3633#endif
3634 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
3635#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3636 printf("Start ComputeETag\n");
3637#endif
3638 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
3639 {
3640 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3641 Kokkos::TeamPolicy<execution_space, ComputeETag>
3642 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3643
3644 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3645 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3646 policy, *this);
3647 execution_space().fence();
3648 }
3649 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");
3650
3651#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3652 printf("End ComputeETag\n");
3653#endif
3654 }
3655
3656 {
3657#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3658 printf("Start ComputeSchurTag\n");
3659#endif
3660 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3661 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
3662 Kokkos::TeamPolicy<execution_space, ComputeSchurTag>
3663 policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3664
3665 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3666 policy, *this);
3667 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
3668 execution_space().fence();
3669#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3670 printf("End ComputeSchurTag\n");
3671#endif
3672 }
3673
3674 {
3675#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3676 printf("Start FactorizeSchurTag\n");
3677#endif
3678 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3679 Kokkos::TeamPolicy<execution_space, FactorizeSchurTag>
3680 policy(packindices_schur.extent(0), team_size, vector_loop_size);
3681 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3682 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3683 policy, *this);
3684 execution_space().fence();
3685 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
3686#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3687 printf("End FactorizeSchurTag\n");
3688#endif
3689 }
3690 }
3691
3692 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3693 }
3694
3695 void run_fused_jacobi() {
3696 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3697 const local_ordinal_type team_size =
3698 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3699 recommended_team_size(blocksize, half_vector_length, 1);
3700 const local_ordinal_type per_team_scratch =
3701 btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * half_vector_length);
3702 {
3703 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag);
3704 Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeFusedJacobiTag>
3705 policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length);
3706
3707 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3708 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeFusedJacobiTag>",
3709 policy, *this);
3710 }
3711 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3712 }
3713};
3714
3718template <typename MatrixType>
3719void performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
3720 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3721 const BlockHelperDetails::PartInterface<MatrixType> &interf,
3722 BlockTridiags<MatrixType> &btdm,
3723 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny,
3724 bool use_fused_jacobi) {
3725 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3726 using execution_space = typename impl_type::execution_space;
3727 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3728 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3729 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
3730
3731 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
3732
3733 int blocksize = btdm.values.extent(1);
3734 // Both Kokkos policy vector length and SIMD type vector length are hardcoded in KokkosBatched.
3735 // For large block sizes, have to fall back to level 1 scratch.
3736 int scratch_required;
3737 if (!use_fused_jacobi) {
3738 // General path scratch requirement
3739 scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3740 } else {
3741 // Block Jacobi scratch requirement: measured in scalars, and uses twice as much (in bytes) per vector lane as the general path.
3742 scratch_required = btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * impl_type::half_vector_length);
3743 }
3744
3745 int max_scratch = team_policy_type::scratch_size_max(0);
3746
3747 if (scratch_required < max_scratch) {
3748 // Can use level 0 scratch
3749 ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
3750 if (!use_fused_jacobi)
3751 function.run();
3752 else
3753 function.run_fused_jacobi();
3754 } else {
3755 // Not enough level 0 scratch, so fall back to level 1
3756 ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
3757 if (!use_fused_jacobi)
3758 function.run();
3759 else
3760 function.run_fused_jacobi();
3761 }
3762 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3763}
3764
3768template <typename MatrixType>
3769struct MultiVectorConverter {
3770 public:
3771 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3772 using execution_space = typename impl_type::execution_space;
3773 using memory_space = typename impl_type::memory_space;
3774
3775 using local_ordinal_type = typename impl_type::local_ordinal_type;
3776 using impl_scalar_type = typename impl_type::impl_scalar_type;
3777 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3778 using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3779 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3780 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3781 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3782 using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
3783 static constexpr int vector_length = impl_type::vector_length;
3784
3785 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3786
3787 private:
3788 // part interface
3789 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3790 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3791 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3792 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3793 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3794 const local_ordinal_type blocksize;
3795 const local_ordinal_type num_vectors;
3796
3797 // packed multivector output (or input)
3798 vector_type_3d_view packed_multivector;
3799 const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3800
3801 template <typename TagType>
3802 KOKKOS_INLINE_FUNCTION void copy_multivectors(const local_ordinal_type &j,
3803 const local_ordinal_type &vi,
3804 const local_ordinal_type &pri,
3805 const local_ordinal_type &ri0) const {
3806 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3807 for (local_ordinal_type i = 0; i < blocksize; ++i)
3808 packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3809 }
3810
3811 public:
3812 MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3813 const vector_type_3d_view &pmv)
3814 : partptr(interf.partptr)
3815 , packptr(interf.packptr)
3816 , part2packrowidx0(interf.part2packrowidx0)
3817 , part2rowidx0(interf.part2rowidx0)
3818 , lclrow(interf.lclrow)
3819 , blocksize(pmv.extent(1))
3820 , num_vectors(pmv.extent(2))
3821 , packed_multivector(pmv) {}
3822
3823 // TODO: modify this routine similarly to the team-level functions
3824 KOKKOS_INLINE_FUNCTION
3825 void
3826 operator()(const local_ordinal_type &packidx) const {
3827 local_ordinal_type partidx = packptr(packidx);
3828 local_ordinal_type npacks = packptr(packidx + 1) - partidx;
3829 const local_ordinal_type pri0 = part2packrowidx0(partidx);
3830
3831 local_ordinal_type ri0[vector_length] = {};
3832 local_ordinal_type nrows[vector_length] = {};
3833 for (local_ordinal_type v = 0; v < npacks; ++v, ++partidx) {
3834 ri0[v] = part2rowidx0(partidx);
3835 nrows[v] = part2rowidx0(partidx + 1) - ri0[v];
3836 }
3837 for (local_ordinal_type j = 0; j < nrows[0]; ++j) {
3838 local_ordinal_type cnt = 1;
3839 for (; cnt < npacks && j != nrows[cnt]; ++cnt)
3840 ;
3841 npacks = cnt;
3842 const local_ordinal_type pri = pri0 + j;
3843 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3844 for (local_ordinal_type i = 0; i < blocksize; ++i)
3845 for (local_ordinal_type v = 0; v < npacks; ++v)
3846 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0[v] + j) + i, col));
3847 }
3848 }
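// Layout sketch (hedged, with made-up sizes): for blocksize = 2 and several
// parts packed into one SIMD pack, the flat Tpetra entries
//   scalar_multivector(2 * lclrow(r) + i, col)
// land in packed_multivector(pri, i, col)[v], where v selects the part within
// the pack and pri counts packed block rows. Interleaving parts in the SIMD
// lane index v is what lets the batched tridiag kernels process up to
// vector_length parts per instruction.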
3849
3850 KOKKOS_INLINE_FUNCTION
3851 void
3852 operator()(const member_type &member) const {
3853 const local_ordinal_type packidx = member.league_rank();
3854 const local_ordinal_type partidx_begin = packptr(packidx);
3855 const local_ordinal_type npacks = packptr(packidx + 1) - partidx_begin;
3856 const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3857 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
3858 const local_ordinal_type partidx = partidx_begin + v;
3859 const local_ordinal_type ri0 = part2rowidx0(partidx);
3860 const local_ordinal_type nrows = part2rowidx0(partidx + 1) - ri0;
3861
3862 if (nrows == 1) {
3863 const local_ordinal_type pri = pri0;
3864 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
3865 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
3866 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0) + i, col));
3867 });
3868 }
3869 } else {
3870 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
3871 const local_ordinal_type pri = pri0 + j;
3872 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3873 for (local_ordinal_type i = 0; i < blocksize; ++i)
3874 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3875 });
3876 }
3877 });
3878 }
3879
3880 void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3881 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3882 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3883
3884 scalar_multivector = scalar_multivector_;
3885 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3886 const local_ordinal_type vl = vector_length;
3887 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3888 Kokkos::parallel_for("MultiVectorConverter::TeamPolicy", policy, *this);
3889 } else {
3890 const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3891 Kokkos::parallel_for("MultiVectorConverter::RangePolicy", policy, *this);
3892 }
3893 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3894 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3895 }
3896};
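// Usage sketch (assuming an already-built PartInterface `interf` and packed
// workspace `pmv`, as in applyInverseJacobi below):
//
//   MultiVectorConverter<MatrixType> conv(interf, pmv);
//   conv.run(X.getLocalViewDevice(Tpetra::Access::ReadOnly));
//
// run() picks a TeamPolicy with vector length vector_length on device builds
// and a flat RangePolicy over packs on host builds.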
3897
3899template <typename T>
3900struct SolveTridiagsDefaultModeAndAlgo;
3901
3902template <>
3903struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3904 typedef KB::Mode::Serial mode_type;
3905 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3906#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3907 typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3908#else
3909 typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3910#endif
3911 static int recommended_team_size(const int /* blksize */,
3912 const int /* vector_length */,
3913 const int /* internal_vector_length */) {
3914 return 1;
3915 }
3916};
3917
3918#if defined(KOKKOS_ENABLE_CUDA)
3919static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
3920 const int vector_length,
3921 const int internal_vector_length) {
3922 const int vector_size = vector_length / internal_vector_length;
3923 int total_team_size(0);
3924 if (blksize <= 5)
3925 total_team_size = 32;
3926 else if (blksize <= 9)
3927 total_team_size = 32; // 64
3928 else if (blksize <= 12)
3929 total_team_size = 96;
3930 else if (blksize <= 16)
3931 total_team_size = 128;
3932 else if (blksize <= 20)
3933 total_team_size = 160;
3934 else
3935 total_team_size = 160;
3936 return total_team_size / vector_size;
3937}
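// Example (hedged arithmetic): blksize = 7 selects total_team_size = 32; with
// vector_length = 8 and internal_vector_length = 4, vector_size = 8 / 4 = 2,
// so the recommended team size is 32 / 2 = 16 threads, keeping
// team_size * vector_size near one warp per team.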
3938
3939template <>
3940struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3941 typedef KB::Mode::Team mode_type;
3942 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3943 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3944 static int recommended_team_size(const int blksize,
3945 const int vector_length,
3946 const int internal_vector_length) {
3947 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3948 }
3949};
3950template <>
3951struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3952 typedef KB::Mode::Team mode_type;
3953 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3954 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3955 static int recommended_team_size(const int blksize,
3956 const int vector_length,
3957 const int internal_vector_length) {
3958 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3959 }
3960};
3961#endif
3962
3963#if defined(KOKKOS_ENABLE_HIP)
3964static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
3965 const int vector_length,
3966 const int internal_vector_length) {
3967 const int vector_size = vector_length / internal_vector_length;
3968 int total_team_size(0);
3969 if (blksize <= 5)
3970 total_team_size = 32;
3971 else if (blksize <= 9)
3972 total_team_size = 32; // 64
3973 else if (blksize <= 12)
3974 total_team_size = 96;
3975 else if (blksize <= 16)
3976 total_team_size = 128;
3977 else if (blksize <= 20)
3978 total_team_size = 160;
3979 else
3980 total_team_size = 160;
3981 return total_team_size / vector_size;
3982}
3983
3984template <>
3985struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3986 typedef KB::Mode::Team mode_type;
3987 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3988 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3989 static int recommended_team_size(const int blksize,
3990 const int vector_length,
3991 const int internal_vector_length) {
3992 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3993 }
3994};
3995template <>
3996struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3997 typedef KB::Mode::Team mode_type;
3998 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3999 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4000 static int recommended_team_size(const int blksize,
4001 const int vector_length,
4002 const int internal_vector_length) {
4003 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
4004 }
4005};
4006#endif
4007
4008#if defined(KOKKOS_ENABLE_SYCL)
4009static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
4010 const int vector_length,
4011 const int internal_vector_length) {
4012 const int vector_size = vector_length / internal_vector_length;
4013 int total_team_size(0);
4014 if (blksize <= 5)
4015 total_team_size = 32;
4016 else if (blksize <= 9)
4017 total_team_size = 32; // 64
4018 else if (blksize <= 12)
4019 total_team_size = 96;
4020 else if (blksize <= 16)
4021 total_team_size = 128;
4022 else if (blksize <= 20)
4023 total_team_size = 160;
4024 else
4025 total_team_size = 160;
4026 return total_team_size / vector_size;
4027}
4028
4029template <>
4030struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
4031 typedef KB::Mode::Team mode_type;
4032 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4033 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4034 static int recommended_team_size(const int blksize,
4035 const int vector_length,
4036 const int internal_vector_length) {
4037 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4038 }
4039};
4040template <>
4041struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
4042 typedef KB::Mode::Team mode_type;
4043 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4044 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4045 static int recommended_team_size(const int blksize,
4046 const int vector_length,
4047 const int internal_vector_length) {
4048 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4049 }
4050};
4051#endif
4052
4053template <typename MatrixType>
4054struct SolveTridiags {
4055 public:
4056 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4057 using execution_space = typename impl_type::execution_space;
4058
4059 using local_ordinal_type = typename impl_type::local_ordinal_type;
4060 using size_type = typename impl_type::size_type;
4061 using impl_scalar_type = typename impl_type::impl_scalar_type;
4062 using magnitude_type = typename impl_type::magnitude_type;
4063 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
4064 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
4066 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4067 using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
4068 using size_type_2d_view = typename impl_type::size_type_2d_view;
4070 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4071 using internal_vector_type_3d_view = typename impl_type::internal_vector_type_3d_view;
4072 using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
4073 using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
4074 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
4075
4076 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
4077
4078 using internal_vector_type = typename impl_type::internal_vector_type;
4079 static constexpr int vector_length = impl_type::vector_length;
4080 static constexpr int internal_vector_length = impl_type::internal_vector_length;
4081
4083 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4084 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
4085
4087 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
4088 using member_type = typename team_policy_type::member_type;
4089
4090 private:
4091 // part interface
4092 local_ordinal_type n_subparts_per_part;
4093 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
4094 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
4095 const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
4096 const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
4097 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
4098 const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
4099 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
4100 const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
4101
4102 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
4103 const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
4104
4105 // block tridiags
4106 const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
4107
4108 // block tridiags values
4109 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
4110 const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
4111 const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
4112
4113 const Unmanaged<internal_vector_type_3d_view> X_internal_vector_values_schur;
4114
4115 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
4116 const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
4117
4118 const local_ordinal_type vector_loop_size;
4119
4120 // copy to multivectors : damping factor and Y_scalar_multivector
4121 Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
4122#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
4123 AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4124#else
4125 /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4126#endif
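// (Device builds need the atomic trait above: in copyToFlatMultiVector every
// thread of a team adds its z_partial_sum into Z_scalar_vector(league_rank)
// concurrently; the host path uses team size 1, so a plain view suffices.)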
4127 const impl_scalar_type df;
4128 const bool compute_diff;
4129 // Schur solve only supports solving one vector at a time (currently).
4130 // If solving on a multivector, we loop over each vec in the solve.
4131 // This is the current vec being solved.
4132 local_ordinal_type active_schur_solve_vec;
4133
4134 public:
4135 SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
4136 const BlockTridiags<MatrixType> &btdm,
4137 const vector_type_3d_view &pmv,
4138 const impl_scalar_type damping_factor,
4139 const bool is_norm_manager_active)
4140 : // interface
4141 n_subparts_per_part(interf.n_subparts_per_part)
4142 , partptr(interf.partptr)
4143 , packptr(interf.packptr)
4144 , packindices_sub(interf.packindices_sub)
4145 , packindices_schur(interf.packindices_schur)
4146 , part2packrowidx0(interf.part2packrowidx0)
4147 , part2packrowidx0_sub(interf.part2packrowidx0_sub)
4148 , lclrow(interf.lclrow)
4149 , packptr_sub(interf.packptr_sub)
4150 , partptr_sub(interf.partptr_sub)
4151 , pack_td_ptr_schur(btdm.pack_td_ptr_schur)
4152 ,
4153 // block tridiags and multivector
4154 pack_td_ptr(btdm.pack_td_ptr)
4155 , D_internal_vector_values((internal_vector_type *)btdm.values.data(),
4156 btdm.values.extent(0),
4157 btdm.values.extent(1),
4158 btdm.values.extent(2),
4159 vector_length / internal_vector_length)
4160 , X_internal_vector_values((internal_vector_type *)pmv.data(),
4161 pmv.extent(0),
4162 pmv.extent(1),
4163 pmv.extent(2),
4164 vector_length / internal_vector_length)
4165 , X_internal_scalar_values((btdm_scalar_type *)pmv.data(),
4166 pmv.extent(0),
4167 pmv.extent(1),
4168 pmv.extent(2),
4169 vector_length)
4170 , X_internal_vector_values_schur(btdm.X_internal_vector_values_schur)
4171 , D_internal_vector_values_schur((internal_vector_type *)btdm.values_schur.data(),
4172 btdm.values_schur.extent(0),
4173 btdm.values_schur.extent(1),
4174 btdm.values_schur.extent(2),
4175 vector_length / internal_vector_length)
4176 , e_internal_vector_values((internal_vector_type *)btdm.e_values.data(),
4177 btdm.e_values.extent(0),
4178 btdm.e_values.extent(1),
4179 btdm.e_values.extent(2),
4180 btdm.e_values.extent(3),
4181 vector_length / internal_vector_length)
4182 , vector_loop_size(vector_length / internal_vector_length)
4183 , Y_scalar_multivector()
4184 , Z_scalar_vector()
4185 , df(damping_factor)
4186 , compute_diff(is_norm_manager_active)
4187 , active_schur_solve_vec(0) {}
4188
4189 public:
4191 KOKKOS_INLINE_FUNCTION
4192 void
4193 copyToFlatMultiVector(const member_type &member,
4194 const local_ordinal_type partidxbeg, // partidx for v = 0
4195 const local_ordinal_type npacks,
4196 const local_ordinal_type pri0,
4197 const local_ordinal_type v, // index with a loop of vector_loop_size
4198 const local_ordinal_type blocksize,
4199 const local_ordinal_type num_vectors) const {
4200 const local_ordinal_type vbeg = v * internal_vector_length;
4201 if (vbeg < npacks) {
4202 local_ordinal_type ri0_vals[internal_vector_length] = {};
4203 local_ordinal_type nrows_vals[internal_vector_length] = {};
4204 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4205 const local_ordinal_type partidx = partidxbeg + vv;
4206 ri0_vals[vi] = partptr(partidx);
4207 nrows_vals[vi] = partptr(partidx + 1) - ri0_vals[vi];
4208 }
4209
4210 impl_scalar_type z_partial_sum(0);
4211 if (nrows_vals[0] == 1) {
4212 const local_ordinal_type j = 0, pri = pri0;
4213 {
4214 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4215 const local_ordinal_type ri0 = ri0_vals[vi];
4216 const local_ordinal_type nrows = nrows_vals[vi];
4217 if (j < nrows) {
4218 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
4219 [&](const local_ordinal_type &i) {
4220 const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4221 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4222 impl_scalar_type &y = Y_scalar_multivector(row, col);
4223 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4224 y += df * yd;
4225
4226 { // if (compute_diff) {
4227 const auto yd_abs = KokkosKernels::ArithTraits<impl_scalar_type>::abs(yd);
4228 z_partial_sum += yd_abs * yd_abs;
4229 }
4230 }
4231 });
4232 }
4233 }
4234 }
4235 } else {
4236 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows_vals[0]),
4237 [&](const local_ordinal_type &j) {
4238 const local_ordinal_type pri = pri0 + j;
4239 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4240 const local_ordinal_type ri0 = ri0_vals[vi];
4241 const local_ordinal_type nrows = nrows_vals[vi];
4242 if (j < nrows) {
4243 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4244 for (local_ordinal_type i = 0; i < blocksize; ++i) {
4245 const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4246 impl_scalar_type &y = Y_scalar_multivector(row, col);
4247 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4248 y += df * yd;
4249
4250 { // if (compute_diff) {
4251 const auto yd_abs = KokkosKernels::ArithTraits<impl_scalar_type>::abs(yd);
4252 z_partial_sum += yd_abs * yd_abs;
4253 }
4254 }
4255 }
4256 }
4257 }
4258 });
4259 }
4260 // if (compute_diff)
4261 Z_scalar_vector(member.league_rank()) += z_partial_sum;
4262 }
4263 }
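// The update above applies damped Jacobi per entry: with damping factor df
// and packed solve result x_hat = X_internal_vector_values(...),
//   y_new = y + df * (x_hat - y) = (1 - df) * y + df * x_hat,
// while z_partial_sum accumulates ||x_hat - y||^2 so the norm manager can
// test convergence without a second pass over the data.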
4264
4268 template <typename WWViewType>
4269 KOKKOS_INLINE_FUNCTION void
4270 solveSingleVector(const member_type &member,
4271 const local_ordinal_type &blocksize,
4272 const local_ordinal_type &i0,
4273 const local_ordinal_type &r0,
4274 const local_ordinal_type &nrows,
4275 const local_ordinal_type &v,
4276 const WWViewType &WW) const {
4277 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4278
4279 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4280 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4281
4282 // base pointers
4283 auto A = D_internal_vector_values.data();
4284 auto X = X_internal_vector_values.data();
4285
4286 // constant
4287 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4288 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
4289 // const local_ordinal_type num_vectors = X_scalar_values.extent(2);
4290
4291 // const local_ordinal_type blocksize = D_scalar_values.extent(1);
4292 const local_ordinal_type astep = D_internal_vector_values.stride(0);
4293 const local_ordinal_type as0 = D_internal_vector_values.stride(1); // blocksize*vector_length;
4294 const local_ordinal_type as1 = D_internal_vector_values.stride(2); // vector_length;
4295 const local_ordinal_type xstep = X_internal_vector_values.stride(0);
4296 const local_ordinal_type xs0 = X_internal_vector_values.stride(1); // vector_length;
4297
4298 // move to starting point
4299 A += i0 * astep + v;
4300 X += r0 * xstep + v;
4301
4302 // for (local_ordinal_type col=0;col<num_vectors;++col)
4303 if (nrows > 1) {
4304 // solve Lx = x
4305 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4306 member,
4307 KB::Diag::Unit,
4308 blocksize, blocksize,
4309 one,
4310 A, as0, as1,
4311 X, xs0);
4312
4313 for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
4314 member.team_barrier();
4315 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4316 member,
4317 blocksize, blocksize,
4318 -one,
4319 A + 2 * astep, as0, as1,
4320 X, xs0,
4321 one,
4322 X + 1 * xstep, xs0);
4323 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4324 member,
4325 KB::Diag::Unit,
4326 blocksize, blocksize,
4327 one,
4328 A + 3 * astep, as0, as1,
4329 X + 1 * xstep, xs0);
4330
4331 A += 3 * astep;
4332 X += 1 * xstep;
4333 }
4334
4335 // solve Ux = x
4336 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4337 member,
4338 KB::Diag::NonUnit,
4339 blocksize, blocksize,
4340 one,
4341 A, as0, as1,
4342 X, xs0);
4343
4344 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4345 A -= 3 * astep;
4346 member.team_barrier();
4347 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4348 member,
4349 blocksize, blocksize,
4350 -one,
4351 A + 1 * astep, as0, as1,
4352 X, xs0,
4353 one,
4354 X - 1 * xstep, xs0);
4355 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4356 member,
4357 KB::Diag::NonUnit,
4358 blocksize, blocksize,
4359 one,
4360 A, as0, as1,
4361 X - 1 * xstep, xs0);
4362 X -= 1 * xstep;
4363 }
4364 // for multiple rhs
4365 // X += xs1;
4366 } else {
4367 const local_ordinal_type ws0 = WW.stride(0);
4368 auto W = WW.data() + v;
4369 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
4370 member, blocksize, X, xs0, W, ws0);
4371 member.team_barrier();
4372 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4373 member,
4374 blocksize, blocksize,
4375 one,
4376 A, as0, as1,
4377 W, xs0,
4378 zero,
4379 X, xs0);
4380 }
4381 }
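// In matrix terms, the single-vector sweep above is an in-place block
// tridiagonal LU solve. With factored blocks stored as repeating
// (diag, upper, lower) triples along astep, the forward pass computes
//   x_j := L_jj^{-1} (x_j - L_{j,j-1} x_{j-1})
// and the backward pass
//   x_j := U_jj^{-1} (x_j - U_{j,j+1} x_{j+1});
// the nrows == 1 branch instead multiplies by the explicitly inverted
// diagonal block, using WW as a temporary.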
4382
4383 template <typename WWViewType>
4384 KOKKOS_INLINE_FUNCTION void
4385 solveMultiVector(const member_type &member,
4386 const local_ordinal_type & /* blocksize */,
4387 const local_ordinal_type &i0,
4388 const local_ordinal_type &r0,
4389 const local_ordinal_type &nrows,
4390 const local_ordinal_type &v,
4391 const WWViewType &WW) const {
4392 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4393
4394 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4395 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4396
4397 // constant
4398 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4399 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
4400
4401 // subview pattern
4402 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4403 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4404 auto X2 = X1;
4405
4406 local_ordinal_type i = i0, r = r0;
4407
4408 if (nrows > 1) {
4409 // solve Lx = x
4410 KB::Trsm<member_type,
4411 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4412 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4413 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
4414 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
4415 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
4416 member.team_barrier();
4417 KB::Gemm<member_type,
4418 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4419 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4420 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
4421 KB::Trsm<member_type,
4422 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4423 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4424 X1.assign_data(X2.data());
4425 }
4426
4427 // solve Ux = x
4428 KB::Trsm<member_type,
4429 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4430 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4431 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4432 i -= 3;
4433 A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
4434 X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
4435 member.team_barrier();
4436 KB::Gemm<member_type,
4437 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4438 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4439
4440 A.assign_data(&D_internal_vector_values(i, 0, 0, v));
4441 KB::Trsm<member_type,
4442 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4443 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4444 X1.assign_data(X2.data());
4445 }
4446 } else {
4447 // matrix is already inverted
4448 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4449 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
4450 member.team_barrier();
4451 KB::Gemm<member_type,
4452 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4453 default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
4454 }
4455 }
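// Same recurrence as solveSingleVector, but with Trsm/Gemm acting on all
// num_vectors right-hand sides at once; assign_data() retargets the A, X1,
// and X2 subviews as the sweep walks the block triples, avoiding repeated
// subview construction inside the loop.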
4456
4457 template <int B>
4458 struct SingleVectorTag {};
4459 template <int B>
4460 struct MultiVectorTag {};
4461
4462 template <int B>
4463 struct SingleVectorSubLineTag {};
4464 template <int B>
4465 struct SingleVectorApplyCTag {};
4466 template <int B>
4467 struct SingleVectorSchurTag {};
4468 template <int B>
4469 struct SingleVectorApplyETag {};
4470 template <int B>
4471 struct CopyVectorToFlatTag {};
4472 template <int B>
4473 struct SingleZeroingTag {};
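// The four Schur-path tags above run in sequence for each active vector
// (see the run() macro below):
//   1. SingleVectorSubLineTag : factored tridiag solve on each interior
//      sub-line, independently per pack.
//   2. SingleVectorApplyCTag  : v_2 -= C * v_1, fold sub-line results into
//      the interface rows via the coupling blocks C.
//   3. SingleVectorSchurTag   : gather interface rows, solve the reduced
//      Schur system, scatter the result back.
//   4. SingleVectorApplyETag  : v_1 -= E * v_2, correct sub-line interiors
//      with the stored E blocks.
// CopyVectorToFlatTag then writes the packed solution back to the flat
// multivector, and SingleZeroingTag resets the per-team norm accumulator.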
4474
4475 template <int B>
4476 KOKKOS_INLINE_FUNCTION void
4477 operator()(const SingleVectorTag<B> &, const member_type &member) const {
4478 const local_ordinal_type packidx = member.league_rank();
4479 const local_ordinal_type partidx = packptr(packidx);
4480 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4481 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4482 const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4483 const local_ordinal_type r0 = part2packrowidx0(partidx);
4484 const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4485 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4486 const local_ordinal_type num_vectors = 1;
4487 internal_vector_scratch_type_3d_view
4488 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4489 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4490 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4491 });
4492 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4493 solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4494 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4495 });
4496 }
4497
4498 template <int B>
4499 KOKKOS_INLINE_FUNCTION void
4500 operator()(const MultiVectorTag<B> &, const member_type &member) const {
4501 const local_ordinal_type packidx = member.league_rank();
4502 const local_ordinal_type partidx = packptr(packidx);
4503 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4504 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4505 const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4506 const local_ordinal_type r0 = part2packrowidx0(partidx);
4507 const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4508 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4509 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4510
4511 internal_vector_scratch_type_3d_view
4512 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4513 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4514 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4515 });
4516 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4517 solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4518 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4519 });
4520 }
4521
4522 template <int B>
4523 KOKKOS_INLINE_FUNCTION void
4524 operator()(const SingleVectorSubLineTag<B> &, const member_type &member) const {
4525 // btdm is packed and sorted by part size, largest first
4526 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4527
4528 const local_ordinal_type subpartidx = packptr_sub(packidx);
4529 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4530 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4531 const local_ordinal_type partidx = subpartidx % n_parts;
4532
4533 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
4534 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4535 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4536 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4537 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4538
4539 //(void) i0;
4540 //(void) nrows;
4541 (void)npacks;
4542
4543 internal_vector_scratch_type_3d_view
4544 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4545
4546 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4547 auto X_internal_vec = Kokkos::subview(X_internal_vector_values, Kokkos::ALL(), Kokkos::ALL(), active_schur_solve_vec, Kokkos::ALL());
4548 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vec, WW);
4549 });
4550 }
4551
4552 template <int B>
4553 KOKKOS_INLINE_FUNCTION void
4554 operator()(const SingleVectorApplyCTag<B> &, const member_type &member) const {
4555 // btdm is packed and sorted by part size, largest first
4556 // const local_ordinal_type packidx = packindices_schur(member.league_rank());
4557 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4558
4559 const local_ordinal_type subpartidx = packptr_sub(packidx);
4560 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4561 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4562 const local_ordinal_type partidx = subpartidx % n_parts;
4563 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4564
4565 // const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4566 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4567 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4568 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4569
4570 // Compute v_2 = v_2 - C v_1
4571
4572 const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
4573 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
4574 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0 + 2 : i0 + 2;
4575
4576 (void)i0_schur;
4577 (void)i0_offset;
4578
4579 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4580
4581 const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx) - 2 : 0;
4582 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx + 1) + 1;
4583
4584 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4585
4586 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4587 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4588
4589 if (local_subpartidx == 0) {
4590 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4591 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4592 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4593 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4594
4595 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4596 member,
4597 blocksize, blocksize,
4598 -one,
4599 C.data(), C.stride(0), C.stride(1),
4600 v_1.data(), v_1.stride(0),
4601 one,
4602 v_2.data(), v_2.stride(0));
4603 });
4604 } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4605 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4606 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), active_schur_solve_vec, v);
4607 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4608 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4609
4610 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4611 member,
4612 blocksize, blocksize,
4613 -one,
4614 C.data(), C.stride(0), C.stride(1),
4615 v_1.data(), v_1.stride(0),
4616 one,
4617 v_2.data(), v_2.stride(0));
4618 });
4619 } else {
4620 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4621 {
4622 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4623 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4624 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4625
4626 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4627 member,
4628 blocksize, blocksize,
4629 -one,
4630 C.data(), C.stride(0), C.stride(1),
4631 v_1.data(), v_1.stride(0),
4632 one,
4633 v_2.data(), v_2.stride(0));
4634 }
4635 {
4636 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), active_schur_solve_vec, v);
4637 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4638 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4639
4640 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4641 member,
4642 blocksize, blocksize,
4643 -one,
4644 C.data(), C.stride(0), C.stride(1),
4645 v_1.data(), v_1.stride(0),
4646 one,
4647 v_2.data(), v_2.stride(0));
4648 }
4649 });
4650 }
4651 }
4652
4653 template <int B>
4654 KOKKOS_INLINE_FUNCTION void
4655 operator()(const SingleVectorSchurTag<B> &, const member_type &member) const {
4656 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4657
4658 const local_ordinal_type partidx = packptr_sub(packidx);
4659
4660 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4661
4662 const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx, 0);
4663 const local_ordinal_type nrows = 2 * (n_subparts_per_part - 1);
4664
4665 const local_ordinal_type r0_schur = nrows * member.league_rank();
4666
4667 internal_vector_scratch_type_3d_view
4668 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4669
4670 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4671 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4672 for (local_ordinal_type i = 0; i < 2; ++i) {
4673 copy3DView<local_ordinal_type>(member,
4674 Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL()),
4675 Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), active_schur_solve_vec, Kokkos::ALL()));
4676 }
4677 }
4678
4679 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4680 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4681 });
4682
4683 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4684 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4685 for (local_ordinal_type i = 0; i < 2; ++i) {
4686 copy3DView<local_ordinal_type>(member,
4687 Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), active_schur_solve_vec, Kokkos::ALL()),
4688 Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL()));
4689 }
4690 }
4691 }
4692
4693 template <int B>
4694 KOKKOS_INLINE_FUNCTION void
4695 operator()(const SingleVectorApplyETag<B> &, const member_type &member) const {
4696 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4697
4698 const local_ordinal_type subpartidx = packptr_sub(packidx);
4699 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4700 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4701 const local_ordinal_type partidx = subpartidx % n_parts;
4702 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4703
4704 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4705 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4706
4707 // Compute v_2 = v_2 - C v_1
4708
4709 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4710
4711 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4712
4713 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4714 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4715
4716 if (local_subpartidx == 0) {
4717 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4718 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4719
4720 for (local_ordinal_type row = 0; row < nrows; ++row) {
4721 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4722 auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4723
4724 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4725 member,
4726 blocksize, blocksize,
4727 -one,
4728 E.data(), E.stride(0), E.stride(1),
4729 v_2.data(), v_2.stride(0),
4730 one,
4731 v_1.data(), v_1.stride(0));
4732 }
4733 });
4734 } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4735 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4736 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4737
4738 for (local_ordinal_type row = 0; row < nrows; ++row) {
4739 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4740 auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4741
4742 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4743 member,
4744 blocksize, blocksize,
4745 -one,
4746 E.data(), E.stride(0), E.stride(1),
4747 v_2.data(), v_2.stride(0),
4748 one,
4749 v_1.data(), v_1.stride(0));
4750 }
4751 });
4752 } else {
4753 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4754 {
4755 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4756
4757 for (local_ordinal_type row = 0; row < nrows; ++row) {
4758 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4759 auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4760
4761 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4762 member,
4763 blocksize, blocksize,
4764 -one,
4765 E.data(), E.stride(0), E.stride(1),
4766 v_2.data(), v_2.stride(0),
4767 one,
4768 v_1.data(), v_1.stride(0));
4769 }
4770 }
4771 {
4772 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4773
4774 for (local_ordinal_type row = 0; row < nrows; ++row) {
4775 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4776 auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4777
4778 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4779 member,
4780 blocksize, blocksize,
4781 -one,
4782 E.data(), E.stride(0), E.stride(1),
4783 v_2.data(), v_2.stride(0),
4784 one,
4785 v_1.data(), v_1.stride(0));
4786 }
4787 }
4788 });
4789 }
4790 }
4791
4792 template <int B>
4793 KOKKOS_INLINE_FUNCTION void
4794 operator()(const CopyVectorToFlatTag<B> &, const member_type &member) const {
4795 const local_ordinal_type packidx = member.league_rank();
4796 const local_ordinal_type partidx = packptr(packidx);
4797 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4798 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4799 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4800 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4801
4802 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4803 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4804 });
4805 }
4806
4807 template <int B>
4808 KOKKOS_INLINE_FUNCTION void
4809 operator()(const SingleZeroingTag<B> &, const member_type &member) const {
4810 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4811 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4812 });
4813 }
4814
4815 void run(const impl_scalar_type_2d_view_tpetra &Y,
4816 const impl_scalar_type_1d_view &Z) {
4817 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4818 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags", SolveTridiags);
4819
4821 this->Y_scalar_multivector = Y;
4822 this->Z_scalar_vector = Z;
4823
4824 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4825 const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4826
4827 const local_ordinal_type team_size =
4828 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4829 recommended_team_size(blocksize, vector_length, internal_vector_length);
4830 const int per_team_scratch = internal_vector_scratch_type_3d_view::shmem_size(blocksize, num_vectors, vector_loop_size);
4831
4832#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4833 if (packindices_schur.extent(1) <= 0) { \
4834 if (num_vectors == 1) { \
4835 Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4836 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4837 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4838 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4839 policy, *this); \
4840 } else { \
4841 Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4842 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4843 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4844 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4845 policy, *this); \
4846 } \
4847 } else { \
4848 { \
4849 Kokkos::TeamPolicy<execution_space, SingleZeroingTag<B>> \
4850 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4851 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4852 policy, *this); \
4853 } \
4854 for (local_ordinal_type vec = 0; vec < num_vectors; vec++) { \
4855 this->active_schur_solve_vec = vec; \
4856 { \
4857 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4858 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4859 Kokkos::TeamPolicy<execution_space, SingleVectorSubLineTag<B>> \
4860 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4861 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4862 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4863 policy, *this); \
4864 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4865 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4866 } \
4867 { \
4868 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4869 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4870 Kokkos::TeamPolicy<execution_space, SingleVectorApplyCTag<B>> \
4871 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4872 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4873 policy, *this); \
4874 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4875 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4876 } \
4877 { \
4878 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4879 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4880 Kokkos::TeamPolicy<execution_space, SingleVectorSchurTag<B>> \
4881 policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4882 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4883 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4884 policy, *this); \
4885 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4886 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4887 } \
4888 { \
4889 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4890 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4891 Kokkos::TeamPolicy<execution_space, SingleVectorApplyETag<B>> \
4892 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4893 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4894 policy, *this); \
4895 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4896 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4897 } \
4898 } \
4899 { \
4900 Kokkos::TeamPolicy<execution_space, CopyVectorToFlatTag<B>> \
4901 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4902 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<CopyVectorToFlatTag>", \
4903 policy, *this); \
4904 } \
4905 } \
4906 break
4907 switch (blocksize) {
4908 case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(3);
4909 case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(5);
4910 case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(6);
4911 case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(7);
4912 case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4913 case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4914 case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4915 case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4916 case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4917 case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4918 case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4919 case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4920 default: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(0);
4921 }
4922#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
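// For instance, `case 7:` expands the macro with B = 7, so the launches read
//   Kokkos::TeamPolicy<execution_space, SingleVectorTag<7>> policy(...);
// and `blocksize` inside operator() becomes the compile-time constant 7,
// letting the batched TRSV/GEMV loops unroll; `default:` runs the B = 0
// variant, which reads the block size from the view extent at run time.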
4923
4924 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
4925 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
4926 }
4927};
4928
4932template <typename MatrixType>
4933int applyInverseJacobi( // importer
4934 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
4935 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
4936 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
4937 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
4938 const bool overlap_communication_and_computation,
4939 // tpetra interface
4940 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
4941 /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
4942 /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
4943 /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
4944 // local object interface
4945 const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
4946 const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
4947 const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
4948 /* */ typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
4949 /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
4950 // preconditioner parameters
4951 const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
4952 /* */ bool is_y_zero,
4953 const int max_num_sweeps,
4954 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
4955 const int check_tol_every) {
4956 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
4957
4958 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4959 using node_memory_space = typename impl_type::node_memory_space;
4960 using local_ordinal_type = typename impl_type::local_ordinal_type;
4961 using size_type = typename impl_type::size_type;
4962 using impl_scalar_type = typename impl_type::impl_scalar_type;
4963 using magnitude_type = typename impl_type::magnitude_type;
4964 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4965 using vector_type_1d_view = typename impl_type::vector_type_1d_view;
4966 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4967 using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
4968
4969 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4970
4971 // the tpetra importer and the async importer cannot both be active
4972 TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
4973 "Neither Tpetra importer nor Async importer is null.");
4974 // max number of sweeps should be a positive number
4975 TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
4976 "Maximum number of sweeps must be >= 1.");
4977
4978 // const parameters
4979 const bool is_seq_method_requested = !tpetra_importer.is_null();
4980 const bool is_async_importer_active = !async_importer.is_null();
4981 const bool is_norm_manager_active = tol > KokkosKernels::ArithTraits<magnitude_type>::zero();
4982 const magnitude_type tolerance = tol * tol;
4983 const local_ordinal_type blocksize = btdm.values.extent(1);
4984 const local_ordinal_type num_vectors = Y.getNumVectors();
4985 const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4986
4987 const impl_scalar_type zero(0.0);
4988
4989 TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4990 "The seq method for applyInverseJacobi, "
4991 << "which in any case is for developer use only, "
4992 << "does not support norm-based termination.");
4993 const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4994 Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4995 TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
4996 std::invalid_argument,
4997 "The seq method for applyInverseJacobi, "
4998 << "which in any case is for developer use only, "
4999 << "only supports memory spaces accessible from host.");
5000
5001 // if more workspace is needed, resize it
5002 const size_type work_span_required = num_blockrows * num_vectors * blocksize;
5003 if (work.span() < work_span_required)
5004 work = vector_type_1d_view("vector workspace 1d view", work_span_required);
5005
5006 // construct W
5007 const local_ordinal_type W_size = interf.packptr.extent(0) - 1;
5008 if (local_ordinal_type(W.extent(0)) < W_size)
5009 W = impl_scalar_type_1d_view("W", W_size);
5010
5011 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5012 {
5013 if (is_seq_method_requested) {
5014 if (Z.getNumVectors() != Y.getNumVectors())
5015 Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
5016 } else {
5017 if (is_async_importer_active) {
5018 // create comm data buffer and keep it here
5019 async_importer->createDataBuffer(num_vectors);
5020 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5021 }
5022 }
5023 }
5024
5025 // wrap the workspace with a 3d view
5026 vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
5027 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5028 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5029 const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
5030 if (is_y_zero) Kokkos::deep_copy(YY, zero);
5031
5032 MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
5033 SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
5034 damping_factor, is_norm_manager_active);
5035
5036 const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
5037
5038 auto A_crs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
5039 auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);
5040
5041 bool hasBlockCrsMatrix = !A_bcrs.is_null();
5042
5043 // It is OK here to use the graph of the A_crs matrix and a block size of 1
5044 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
5045
5046 BlockHelperDetails::ComputeResidualVector<MatrixType>
5047 compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
5048 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
5049 hasBlockCrsMatrix);
5050
5051 // norm manager workspace resize
5052 if (is_norm_manager_active)
5053 norm_manager.setCheckFrequency(check_tol_every);
5054
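// Each sweep below realizes damped block Jacobi: with R = A - D and damping
// factor df,
//   y_{k+1} = y_k + df * (D^{-1} (x - R y_k) - y_k)
//           = (1 - df) y_k + df * D^{-1} (x - R y_k),
// where the residual is formed (fused with the pack in the non-seq path),
// the packed tridiagonal systems are solved, and W collects the squared
// update norms for the optional tolerance check every check_tol_every sweeps.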
5055 // iterate
5056 int sweep = 0;
5057 for (; sweep < max_num_sweeps; ++sweep) {
5058 {
5059 if (is_y_zero) {
5060 // pmv := x(lclrow)
5061 multivector_converter.run(XX);
5062 } else {
5063 if (is_seq_method_requested) {
5064 // SEQ METHOD IS TESTING ONLY
5065
5066 // y := x - R y
5067 Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5068 compute_residual_vector.run(YY, XX, ZZ);
5069
5070 // pmv := y(lclrow).
5071 multivector_converter.run(YY);
5072 } else {
5073 // fused y := x - R y and pmv := y(lclrow);
5074 // real use case does not use overlap comp and comm
5075 if (overlap_communication_and_computation || !is_async_importer_active) {
5076 if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5077 // OverlapTag, compute_owned = true
5078 compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
5079 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5080 if (is_async_importer_active) async_importer->cancel();
5081 break;
5082 }
5083 if (is_async_importer_active) {
5084 async_importer->syncRecv();
5085 // OverlapTag, compute_owned = false
5086 compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
5087 }
5088 } else {
5089 if (is_async_importer_active)
5090 async_importer->syncExchange(YY);
5091 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5092 // AsyncTag
5093 compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5094 }
5095 }
5096 }
5097 }
5098
5099 // pmv := inv(D) pmv.
5100 {
5101 solve_tridiags.run(YY, W);
5102 }
5103 {
5104 if (is_norm_manager_active) {
5105 // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
5106 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5107 if (sweep + 1 == max_num_sweeps) {
5108 norm_manager.ireduce(sweep, true);
5109 norm_manager.checkDone(sweep + 1, tolerance, true);
5110 } else {
5111 norm_manager.ireduce(sweep);
5112 }
5113 }
5114 }
5115 is_y_zero = false;
5116 }
5117
5118 // sqrt the norms for the caller's use.
5119 if (is_norm_manager_active) norm_manager.finalize();
5120
5121 return sweep;
5122}
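// In effect, with omega = damping_factor and R = A - D, each sweep of the
// loop above (in applyInverseJacobi) computes
//
//   pmv := inv(D) * (x - R*y)          // residual + packed tridiag solves
//   y   := (1 - omega)*y + omega*pmv   // damped update (the b = 1 case)
//
// and, when a tolerance is given, compares the squared norm gathered in W
// against tolerance = tol*tol every check_tol_every sweeps.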
5123
5124// Implementation of fused block Jacobi for a specific block size,
5125// or (if B == 0) for a general block size.
5126template <typename MatrixType, int B>
5127int applyFusedBlockJacobi_Impl(
5128 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5129 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5130 const bool overlap_communication_and_computation,
5131 // tpetra interface
5132 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5133 /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5134 /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5135 // local object interface
5136 const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5137 const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5138 const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5139 /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5140 /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5141 // preconditioner parameters
5142 const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5143 /* */ bool is_y_zero,
5144 const int max_num_sweeps,
5145 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5146 const int check_tol_every) {
5147 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5148 using local_ordinal_type = typename impl_type::local_ordinal_type;
5149 using size_type = typename impl_type::size_type;
5150 using magnitude_type = typename impl_type::magnitude_type;
5151 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5152 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
5153
5154 // the tpetra importer and the async importer cannot both be active
5155 TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5156 "The Tpetra importer and the async importer cannot both be non-null.");
5157 // the max number of sweeps must be a positive number
5158 TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5159 "Maximum number of sweeps must be >= 1.");
5160
5161 // const parameters
5162 const bool is_async_importer_active = !async_importer.is_null();
5163 const bool is_norm_manager_active = tol > KokkosKernels::ArithTraits<magnitude_type>::zero();
5164 const magnitude_type tolerance = tol * tol;
5165 const local_ordinal_type blocksize = btdm.d_inv.extent(1);
5166 const local_ordinal_type num_vectors = Y.getNumVectors();
5167 const local_ordinal_type num_blockrows = interf.nparts;
5168
5169 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5170 {
5171 if (is_async_importer_active) {
5172 // create comm data buffer and keep it here
5173 async_importer->createDataBuffer(num_vectors);
5174 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5175 }
5176 }
5177
5178 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5179 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5180
5181 const bool two_pass_residual =
5182 overlap_communication_and_computation && is_async_importer_active;
5183
5184 // Calculate the required work size and reallocate it if not already big enough.
5185 // Check that our assumptions about YY dimension are correct.
5186 TEUCHOS_TEST_FOR_EXCEPT_MSG(
5187 size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1),
5188 "Local LHS vector (YY) has total size " << YY.extent(0) << "x" << YY.extent(1) << " = " << YY.extent(0) * YY.extent(1) << ",\n"
5189 << "but expected " << num_blockrows << "x" << blocksize << "x" << num_vectors << " = " << size_t(num_blockrows) * blocksize * num_vectors << '\n');
5190 size_type work_required = size_type(num_blockrows) * blocksize * num_vectors;
5191 if (work.extent(0) < work_required) {
5192 work = impl_scalar_type_1d_view(do_not_initialize_tag("flat workspace 1d view"), work_required);
5193 }
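// The flat workspace holds exactly one extra copy of y; it backs the second
// buffer of the double-buffering scheme set up below.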
5194
5195 Unmanaged<impl_scalar_type_2d_view_tpetra> y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors);
5196
5197 // construct W
5198 if (W.extent(0) != size_t(num_blockrows))
5199 W = impl_scalar_type_1d_view(do_not_initialize_tag("W"), num_blockrows);
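// W collects one norm contribution per block row each sweep; reduceVector
// below collapses it into the norm manager's buffer.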
5200
5201 // Create the required functors upfront (this is inexpensive - all shallow copies)
5202 BlockHelperDetails::ComputeResidualAndSolve_SolveOnly<MatrixType, B>
5203 functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor);
5204 BlockHelperDetails::ComputeResidualAndSolve_1Pass<MatrixType, B>
5205 functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5206 BlockHelperDetails::ComputeResidualAndSolve_2Pass<MatrixType, B>
5207 functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor);
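// Which functor runs depends on the sweep and the communication pattern:
// - SolveOnly: first sweep with y == 0, so no residual is needed;
// - 1Pass: residual, inv(D) apply, and norm in a single kernel;
// - 2Pass: owned columns first, remote columns after the async receives land.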
5208
5209 // set how often the norm manager checks the tolerance
5210 if (is_norm_manager_active)
5211 norm_manager.setCheckFrequency(check_tol_every);
5212
5213 // For double-buffering.
5214 // yy_buffers[current_y] has the current iterate of y.
5215 // yy_buffers[1-current_y] has the next iterate of y.
5216 Unmanaged<impl_scalar_type_2d_view_tpetra> y_buffers[2] = {YY, y_doublebuf};
5217 int current_y = 0;
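// A Jacobi sweep reads every entry of the current iterate while writing the
// next one, so the two iterates must live in distinct buffers; flipping
// current_y each sweep avoids a deep copy.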
5218
5219 // iterate
5220 int sweep = 0;
5221 for (; sweep < max_num_sweeps; ++sweep) {
5222 if (is_y_zero) {
5223 // If y is initially zero, then we are just computing y := damping_factor * Dinv * x
5224 functor_solve_only.run(XX, y_buffers[1 - current_y]);
5225 } else {
5226 // the real use case does not overlap computation and communication
5227 if (overlap_communication_and_computation || !is_async_importer_active) {
5228 if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]);
5229 if (two_pass_residual) {
5230 // Pass 1 computes owned residual and stores into new y buffer,
5231 // but doesn't apply Dinv or produce a norm yet
5232 functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1 - current_y]);
5233 } else {
5234 // This case happens when running on a single rank:
5235 // there are no remote columns, so the residual and solve can happen in one step.
5236 functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5237 }
5238 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5239 if (is_async_importer_active) async_importer->cancel();
5240 break;
5241 }
5242 if (is_async_importer_active) {
5243 async_importer->syncRecv();
5244 // Stage 2 finishes computing the residual, then applies Dinv and computes norm.
5245 functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5246 }
5247 } else {
5248 if (is_async_importer_active)
5249 async_importer->syncExchange(y_buffers[current_y]);
5250 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5251 // Full residual, Dinv apply, and norm in one kernel
5252 functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5253 }
5254 }
5255
5256 // Compute global norm.
5257 if (is_norm_manager_active) {
5258 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5259 if (sweep + 1 == max_num_sweeps) {
5260 norm_manager.ireduce(sweep, true);
5261 norm_manager.checkDone(sweep + 1, tolerance, true);
5262 } else {
5263 norm_manager.ireduce(sweep);
5264 }
5265 }
5266 is_y_zero = false;
5267 // flip the y buffers for the next iteration (or for termination, if we reached max_num_sweeps).
5268 current_y = 1 - current_y;
5269 }
5270 if (current_y == 1) {
5271 // We finished iterating with y in the double buffer, so copy it to the user's vector.
5272 Kokkos::deep_copy(YY, y_doublebuf);
5273 }
5274
5275 // sqrt the norms for the caller's use.
5276 if (is_norm_manager_active) norm_manager.finalize();
5277 return sweep;
5278}
5279
5283 template <typename MatrixType>
5284 int applyFusedBlockJacobi(
5285 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5286 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5287 const bool overlap_communication_and_computation,
5288 // tpetra interface
5289 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5290 /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5291 /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5292 // local object interface
5293 const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5294 const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5295 const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5296 /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5297 /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5298 // preconditioner parameters
5299 const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5300 /* */ bool is_y_zero,
5301 const int max_num_sweeps,
5302 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5303 const int check_tol_every) {
5304 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi);
5305 int blocksize = btdm.d_inv.extent(1);
5306 int sweep = 0;
5307#define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) \
5308 { \
5309 sweep = applyFusedBlockJacobi_Impl<MatrixType, B>( \
5310 tpetra_importer, async_importer, overlap_communication_and_computation, \
5311 X, Y, W, interf, btdm, amd, work, \
5312 norm_manager, damping_factor, is_y_zero, \
5313 max_num_sweeps, tol, check_tol_every); \
5314 } \
5315 break
5316 switch (blocksize) {
5317 case 3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(3);
5318 case 5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(5);
5319 case 7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(7);
5320 case 9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(9);
5321 case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10);
5322 case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11);
5323 case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16);
5324 case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17);
5325 case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18);
5326 default: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(0);
5327 }
5328#undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI
5329
5330 return sweep;
5331}
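// The switch above maps the runtime block size onto compile-time
// instantiations so the batched kernels can unroll their inner loops, with
// B = 0 as the generic runtime-blocksize fallback. The same idiom in
// isolation (an illustrative sketch; kernel_impl/dispatch are hypothetical
// names, not part of this header):
//
//   template <int B> int kernel_impl(int runtime_b); // B == 0 => generic
//   inline int dispatch(int b) {
//     switch (b) {
//       case 3: return kernel_impl<3>(b);
//       case 5: return kernel_impl<5>(b);
//       default: return kernel_impl<0>(b); // read block size at runtime
//     }
//   }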
5332
5333 template <typename MatrixType>
5334 struct ImplObject {
5335 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5336 using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5337 using block_tridiags_type = BlockTridiags<MatrixType>;
5338 using amd_type = BlockHelperDetails::AmD<MatrixType>;
5339 using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
5340 using async_import_type = AsyncableImport<MatrixType>;
5341
5342 // distributed objects
5343 Teuchos::RCP<const typename impl_type::tpetra_row_matrix_type> A;
5344 Teuchos::RCP<const typename impl_type::tpetra_crs_graph_type> blockGraph;
5345 Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
5346 Teuchos::RCP<async_import_type> async_importer;
5347 bool overlap_communication_and_computation;
5348
5349 // copy of Y (mutable to penetrate const)
5350 mutable typename impl_type::tpetra_multivector_type Z;
5351 mutable typename impl_type::impl_scalar_type_1d_view W;
5352
5353 // local objects
5354 part_interface_type part_interface;
5355 block_tridiags_type block_tridiags; // D
5356 amd_type a_minus_d; // R = A - D
5357
5358 // whether to use fused block Jacobi path
5359 bool use_fused_jacobi;
5360
5361 // the vector workspace is used for the general block tridiagonal case
5362 mutable typename impl_type::vector_type_1d_view work; // right-hand-side workspace (1-D view of vector)
5363 // the scalar workspace is used for the fused block Jacobi case
5364 mutable typename impl_type::impl_scalar_type_1d_view work_flat; // right-hand-side workspace (1-D view of scalar)
5365 mutable norm_manager_type norm_manager;
5366};
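// This object persists between the symbolic/numeric setup phases and
// repeated applies, so importers, packed tridiagonals, and workspaces are
// allocated once and reused.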
5367
5368} // namespace BlockTriDiContainerDetails
5369
5370} // namespace Ifpack2
5371
5372#endif