Ifpack2_BlockTriDiContainer_impl.hpp
// @HEADER
// *****************************************************************************
// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
//
// Copyright 2009 NTESS and the Ifpack2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP

// #define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
// #define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF

#include <Teuchos_Details_MpiTypeTraits.hpp>

#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
#include <Tpetra_Distributor.hpp>
#include <Tpetra_BlockMultiVector.hpp>

#if KOKKOS_VERSION >= 40799
#include <KokkosKernels_ArithTraits.hpp>
#else
#include <Kokkos_ArithTraits.hpp>
#endif
#include <KokkosBatched_Util.hpp>
#include <KokkosBatched_Vector.hpp>
#include <KokkosBatched_Copy_Decl.hpp>
#include <KokkosBatched_Copy_Impl.hpp>
#include <KokkosBatched_AddRadial_Decl.hpp>
#include <KokkosBatched_AddRadial_Impl.hpp>
#include <KokkosBatched_SetIdentity_Decl.hpp>
#include <KokkosBatched_SetIdentity_Impl.hpp>
#include <KokkosBatched_Gemm_Decl.hpp>
#include <KokkosBatched_Gemm_Serial_Impl.hpp>
#include <KokkosBatched_Gemm_Team_Impl.hpp>
#include <KokkosBatched_Gemv_Decl.hpp>
#include <KokkosBatched_Gemv_Team_Impl.hpp>
#include <KokkosBatched_Trsm_Decl.hpp>
#include <KokkosBatched_Trsm_Serial_Impl.hpp>
#include <KokkosBatched_Trsm_Team_Impl.hpp>
#include <KokkosBatched_Trsv_Decl.hpp>
#include <KokkosBatched_Trsv_Serial_Impl.hpp>
#include <KokkosBatched_Trsv_Team_Impl.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_LU_Serial_Impl.hpp>
#include <KokkosBatched_LU_Team_Impl.hpp>

#include <KokkosBlas1_nrm1.hpp>
#include <KokkosBlas1_nrm2.hpp>

#include <memory>

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"
#include "Ifpack2_BlockComputeResidualAndSolve.hpp"
// need to expose this via a CMake variable (or only use this flag when necessary)
// #define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
// #undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#include "cuda_profiler_api.h"
#endif

// I am not 100% sure about MPI 3 on CUDA
#if MPI_VERSION >= 3
#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
#endif

// ::: Experiments :::
// Define either pinned memory or CUDA memory for MPI buffers.
// If both macros are disabled, the Tpetra memory space is used (UVM space for CUDA).
// If the pinned-memory macro is defined, pinned memory is used instead of a device pointer.
// By default, pinned memory is enabled.
#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
// #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI

// if defined, all views are allocated on CUDA space instead of CUDA UVM space
#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE

// if defined, btdm_scalar_type is used (if impl_scalar_type is double, btdm_scalar_type is float)
#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
#endif

// if defined, multiple execution space instances are used
#define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES

namespace Ifpack2 {

namespace BlockTriDiContainerDetails {

namespace KB = KokkosBatched;

using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;

template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
                                          MemoryTraitsType::is_random_access |
                                          flag>;

template <typename ViewType>
using Unmanaged = Kokkos::View<typename ViewType::data_type,
                               typename ViewType::array_layout,
                               typename ViewType::device_type,
                               MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
template <typename ViewType>
using Atomic = Kokkos::View<typename ViewType::data_type,
                            typename ViewType::array_layout,
                            typename ViewType::device_type,
                            MemoryTraits<typename ViewType::memory_traits, Kokkos::Atomic>>;
template <typename ViewType>
using Const = Kokkos::View<typename ViewType::const_data_type,
                           typename ViewType::array_layout,
                           typename ViewType::device_type,
                           typename ViewType::memory_traits>;
template <typename ViewType>
using ConstUnmanaged = Const<Unmanaged<ViewType>>;

template <typename ViewType>
using AtomicUnmanaged = Atomic<Unmanaged<ViewType>>;

template <typename ViewType>
using Scratch = Kokkos::View<typename ViewType::data_type,
                             typename ViewType::array_layout,
                             typename ViewType::execution_space::scratch_memory_space,
                             MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;

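// Illustrative sketch of how these view decorators compose. Given a
// hypothetical view type (v_t is not defined in this file):
//   using v_t = Kokkos::View<double **, Kokkos::LayoutLeft, device_type>;
// Unmanaged<v_t> aliases the same data with unmanaged memory traits,
// ConstUnmanaged<v_t> additionally makes the element type const, and
// Scratch<v_t> places the (unmanaged) view in team scratch memory.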
template <typename T>
struct BlockTridiagScalarType {
  typedef T type;
};
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
template <>
struct BlockTridiagScalarType<double> {
  typedef float type;
};
// template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
#endif
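// Example: when IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
// is defined, BlockTridiagScalarType<double>::type is float, so the block
// tridiagonal factors are stored and solved in single precision while the
// surrounding scalar type stays double.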

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());

#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
  { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStop()); }
#else
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
#endif

template <typename MatrixType>
typename Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;
  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = !A_bcrs.is_null();

  // It is OK here to use the graph of the A_crs matrix and a block size of 1.
  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());  // tpetra crs graph object

  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap(), blocksize)));
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return Teuchos::rcp(new tpetra_import_type(src, tgt));
}

// Partial replacement for forward-mode MultiVector::doImport.
// Permits overlapped communication and computation, but also supports a
// synchronized exchange. I'm finding that overlapped comm/comp can give quite
// poor performance on some platforms, so we can't just use it unconditionally.

template <typename MatrixType>
struct AsyncableImport {
 public:
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

 private:
#if !defined(HAVE_IFPACK2_MPI)
  typedef int MPI_Request;
  typedef int MPI_Comm;
#endif

  using scalar_type = typename impl_type::scalar_type;

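  // Thin wrappers over the MPI point-to-point calls; they compile to no-ops
  // when MPI is disabled. When the caller passes ireq == NULL, the request is
  // freed immediately (fire-and-forget), so only requests the caller intends
  // to wait on are kept around.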
  static int isend(const MPI_Comm comm, const char *buf, int count, int dest, int tag, MPI_Request *ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    int ret = MPI_Isend(const_cast<char *>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }

  static int irecv(const MPI_Comm comm, char *buf, int count, int src, int tag, MPI_Request *ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }

  static int waitany(int count, MPI_Request *reqs, int *index) {
#ifdef HAVE_IFPACK2_MPI
    return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
#else
    return 0;
#endif
  }

  static int waitall(int count, MPI_Request *reqs) {
#ifdef HAVE_IFPACK2_MPI
    return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
#else
    return 0;
#endif
  }

 public:
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;

  using int_1d_view_host = Kokkos::View<int *, Kokkos::HostSpace>;
  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type *, Kokkos::HostSpace>;

  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using size_type_1d_view_host = Kokkos::View<size_type *, Kokkos::HostSpace>;

#if defined(KOKKOS_ENABLE_CUDA)
  using impl_scalar_type_1d_view =
      typename std::conditional<std::is_same<execution_space, Kokkos::Cuda>::value,
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
                                Kokkos::View<impl_scalar_type *, Kokkos::CudaHostPinnedSpace>,
#elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
                                Kokkos::View<impl_scalar_type *, Kokkos::CudaSpace>,
#else  // no experimental macros are defined
                                typename impl_type::impl_scalar_type_1d_view,
#endif
                                typename impl_type::impl_scalar_type_1d_view>::type;
#else
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
#endif
  using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type *, Kokkos::HostSpace>;
  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

#ifdef HAVE_IFPACK2_MPI
  MPI_Comm comm;
#endif

  impl_scalar_type_2d_view_tpetra remote_multivector;
  local_ordinal_type blocksize;

  template <typename T>
  struct SendRecvPair {
    T send, recv;
  };

  // (s)end and (r)eceive data:
  SendRecvPair<int_1d_view_host> pids;                       // mpi ranks
  SendRecvPair<std::vector<MPI_Request>> reqs;               // MPI_Request is a pointer type; it cannot live in a Kokkos view
  SendRecvPair<size_type_1d_view> offset;                    // offsets into the local id list and data buffer
  SendRecvPair<size_type_1d_view_host> offset_host;          // offsets into the local id list and data buffer
  SendRecvPair<local_ordinal_type_1d_view> lids;             // local id list
  SendRecvPair<impl_scalar_type_1d_view> buffer;             // data buffer
  SendRecvPair<impl_scalar_type_1d_view_host> buffer_host;   // data buffer

  local_ordinal_type_1d_view dm2cm;  // permutation

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
  using exec_instance_1d_std_vector = std::vector<execution_space>;
  exec_instance_1d_std_vector exec_instances;
#endif

  // for cuda
 public:
  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
                       const size_type_1d_view &offs) {
    // wrap lens in a Kokkos view and deep copy it to device
    Kokkos::View<size_t *, Kokkos::HostSpace> lens_host(const_cast<size_t *>(lens.getRawPtr()), lens.size());
    const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);

    // exclusive scan
    const Kokkos::RangePolicy<execution_space> policy(0, offs.extent(0));
    const local_ordinal_type lens_size = lens_device.extent(0);
    Kokkos::parallel_scan(
        "AsyncableImport::RangePolicy::setOffsetValues",
        policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
          if (final)
            offs(i) = update;
          update += (i < lens_size ? lens_device[i] : 0);
        });
  }
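  // Worked example of the exclusive scan above (and its host twin below):
  // lens = {3, 1, 4} yields offs = {0, 3, 4, 8}; offs has one more entry than
  // lens, and offs(i+1) - offs(i) recovers lens[i].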

  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
                           const size_type_1d_view_host &offs) {
    // exclusive scan directly on the host data; no device copy is needed here
    offs(0) = 0;
    for (local_ordinal_type i = 1, iend = offs.extent(0); i < iend; ++i) {
      offs(i) = offs(i - 1) + lens[i - 1];
    }
  }

 private:
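  // Layout note (from the code below): offset_host.send/recv hold an
  // exclusive scan of the per-neighbor message lengths, so for neighbor i the
  // local ids live in lids.send[offset_host.send[i] .. offset_host.send[i+1])
  // and the matching buffer segment is that range scaled by
  // blocksize * num_vectors.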
  void createMpiRequests(const tpetra_import_type &import) {
    Tpetra::Distributor &distributor = import.getDistributor();

    // copy pids from distributor
    const auto pids_from = distributor.getProcsFrom();
    pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
    memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int) * pids.recv.extent(0));

    const auto pids_to = distributor.getProcsTo();
    pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
    memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int) * pids.send.extent(0));

    // mpi requests
    reqs.recv.resize(pids.recv.extent(0));
    memset(reqs.recv.data(), 0, reqs.recv.size() * sizeof(MPI_Request));
    reqs.send.resize(pids.send.extent(0));
    memset(reqs.send.data(), 0, reqs.send.size() * sizeof(MPI_Request));

    // construct offsets
#if 0
    const auto lengths_to = distributor.getLengthsTo();
    offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

    const auto lengths_from = distributor.getLengthsFrom();
    offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

    setOffsetValues(lengths_to, offset.send);
    offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);

    setOffsetValues(lengths_from, offset.recv);
    offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
#else
    const auto lengths_to = distributor.getLengthsTo();
    offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

    const auto lengths_from = distributor.getLengthsFrom();
    offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

    setOffsetValuesHost(lengths_to, offset_host.send);
    // offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);

    setOffsetValuesHost(lengths_from, offset_host.recv);
    // offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
#endif
  }

  void createSendRecvIDs(const tpetra_import_type &import) {
    // For each remote PID, the list of LIDs to receive.
    const auto remote_lids = import.getRemoteLIDs();
    const local_ordinal_type_1d_view_host
        remote_lids_view_host(const_cast<local_ordinal_type *>(remote_lids.getRawPtr()), remote_lids.size());
    lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
    Kokkos::deep_copy(lids.recv, remote_lids_view_host);

    // For each export PID, the list of LIDs to send.
    auto epids = import.getExportPIDs();
    auto elids = import.getExportLIDs();
    TEUCHOS_ASSERT(epids.size() == elids.size());
    lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
    auto lids_send_host = Kokkos::create_mirror_view(lids.send);

    // naive search (not sure if pids or epids are sorted)
    for (local_ordinal_type cnt = 0, i = 0, iend = pids.send.extent(0); i < iend; ++i) {
      const auto pid_send_value = pids.send[i];
      for (local_ordinal_type j = 0, jend = epids.size(); j < jend; ++j)
        if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
      TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i + 1]);
    }
    Kokkos::deep_copy(lids.send, lids_send_host);
  }

  void createExecutionSpaceInstances() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
    // The following line creates 8 streams:
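    // These instances are round-robined over the per-neighbor pack, copy, and
    // send/unpack steps below (exec_instances[i % 8]), so work for different
    // neighbors can proceed on different streams.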
#if KOKKOS_VERSION >= 40699
    exec_instances =
        Kokkos::Experimental::partition_space(execution_space(), std::vector<int>(8, 1));
#else
    exec_instances =
        Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
#endif
#endif
  }

 public:
  // for cuda, all tag types are public
  struct ToBuffer {};
  struct ToMultiVector {};
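  // ToBuffer tags a copy that packs multivector entries into a contiguous
  // send buffer; ToMultiVector tags the reverse, unpacking a receive buffer
  // into the remote multivector.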

  AsyncableImport(const Teuchos::RCP<const tpetra_map_type> &src_map,
                  const Teuchos::RCP<const tpetra_map_type> &tgt_map,
                  const local_ordinal_type blocksize_,
                  const local_ordinal_type_1d_view dm2cm_) {
    blocksize = blocksize_;
    dm2cm = dm2cm_;

#ifdef HAVE_IFPACK2_MPI
    comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
#endif
    const tpetra_import_type import(src_map, tgt_map);

    createMpiRequests(import);
    createSendRecvIDs(import);
    createExecutionSpaceInstances();
  }

  void createDataBuffer(const local_ordinal_type &num_vectors) {
    const size_type extent_0 = lids.recv.extent(0) * blocksize;
    const size_type extent_1 = num_vectors;
    if (remote_multivector.extent(0) == extent_0 &&
        remote_multivector.extent(1) == extent_1) {
      // skip
    } else {
      remote_multivector =
          impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);

      const auto send_buffer_size = offset_host.send[offset_host.send.extent(0) - 1] * blocksize * num_vectors;
      const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0) - 1] * blocksize * num_vectors;

      buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
      buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);

      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
        buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
      }
    }
  }
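  // Sizing example for createDataBuffer above: with r = lids.recv.extent(0)
  // remote rows, blocksize b and nv vectors, remote_multivector is
  // (r * b) x nv, and each buffer holds (total neighbor length) * b * nv
  // scalars, taken from the last entry of the exclusive-scan offset arrays.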
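  // Note: cancel() below drains outstanding requests by waiting for them to
  // complete; it does not call MPI_Cancel.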
  void cancel() {
#ifdef HAVE_IFPACK2_MPI
    waitall(reqs.recv.size(), reqs.recv.data());
    waitall(reqs.send.size(), reqs.send.data());
#endif
  }

  // ======================================================================
  // Async version using execution space instances
  // ======================================================================

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
  template <typename PackTag>
  static void copy(const local_ordinal_type_1d_view &lids_,
                   const impl_scalar_type_1d_view &buffer_,
                   const local_ordinal_type ibeg_,
                   const local_ordinal_type iend_,
                   const impl_scalar_type_2d_view_tpetra &multivector_,
                   const local_ordinal_type blocksize_,
                   const execution_space &exec_instance_) {
    const local_ordinal_type num_vectors = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
    const local_ordinal_type idiff = iend_ - ibeg_;
    const auto abase = buffer_.data() + mv_blocksize * ibeg_;

    using team_policy_type = Kokkos::TeamPolicy<execution_space>;
    local_ordinal_type vector_size(0);
    if (blocksize_ <= 4)
      vector_size = 4;
    else if (blocksize_ <= 8)
      vector_size = 8;
    else if (blocksize_ <= 16)
      vector_size = 16;
    else
      vector_size = 32;

    const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
    const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
    Kokkos::parallel_for(  //"AsyncableImport::TeamPolicy::copyViaCudaStream",
        Kokkos::Experimental::require(policy, work_item_property),
        KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
          const local_ordinal_type i = member.league_rank();
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](const local_ordinal_type &j) {
            auto aptr = abase + blocksize_ * (i + idiff * j);
            auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
            if (std::is_same<PackTag, ToBuffer>::value)
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
                aptr[k] = bptr[k];
              });
            else
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
                bptr[k] = aptr[k];
              });
          });
        });
  }
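  // Buffer layout used by copy() above: within a chunk [ibeg, iend), entry
  // (row i, vector j) sits at abase + blocksize * (i + idiff * j), i.e. each
  // vector's blocks are stored contiguously before the next vector begins.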

  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);

#ifdef HAVE_IFPACK2_MPI
    // constants and reallocate data buffers if necessary
    const local_ordinal_type num_vectors = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize * num_vectors;

    // 0. post receives asynchronously
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        irecv(comm,
              reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
              (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      } else {
        irecv(comm,
              reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
              (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
    }

    execution_space().fence();

    // 1. async memcpy
    for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
      // 1.0. enqueue pack buffer
      if (i < 8) exec_instances[i % 8].fence();
      copy<ToBuffer>(lids.send, buffer.send,
                     offset_host.send(i), offset_host.send(i + 1),
                     mv, blocksize,
                     // execution_space());
                     exec_instances[i % 8]);
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        // if (i<8) exec_instances[i%8].fence();
        const local_ordinal_type num_vectors = mv.extent(1);
        const local_ordinal_type mv_blocksize = blocksize * num_vectors;

        Kokkos::deep_copy(exec_instances[i % 8],
                          Kokkos::subview(buffer_host.send,
                                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                              offset_host.send(i) * mv_blocksize,
                                              offset_host.send(i + 1) * mv_blocksize)),
                          Kokkos::subview(buffer.send,
                                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                              offset_host.send(i) * mv_blocksize,
                                              offset_host.send(i + 1) * mv_blocksize)));
      }
    }
    // execution_space().fence();
    for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
      // 1.1. sync the stream and isend
      if (i < 8) exec_instances[i % 8].fence();
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        isend(comm,
              reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
              (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      } else {
        isend(comm,
              reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
              (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
    }

    // 2. poke communication: an Iprobe nudges some MPI implementations into
    // making progress on the outstanding requests.
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif  // HAVE_IFPACK2_MPI
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void syncRecvVar1() {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
#ifdef HAVE_IFPACK2_MPI
    // 0. wait for the asynchronous receives.
    for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.recv.extent(0)); ++i) {
      local_ordinal_type idx = i;

      // 0.0. wait any
      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);

      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors = remote_multivector.extent(1);
        const local_ordinal_type mv_blocksize = blocksize * num_vectors;

        Kokkos::deep_copy(
            Kokkos::subview(buffer.recv,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.recv(idx) * mv_blocksize,
                                offset_host.recv(idx + 1) * mv_blocksize)),
            Kokkos::subview(buffer_host.recv,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.recv(idx) * mv_blocksize,
                                offset_host.recv(idx + 1) * mv_blocksize)));
      }

      // 0.1. unpack data after it has been moved onto the device
      copy<ToMultiVector>(lids.recv, buffer.recv,
                          offset_host.recv(idx), offset_host.recv(idx + 1),
                          remote_multivector, blocksize,
                          exec_instances[idx % 8]);
    }

    // 1. fence all streams so the unpacked data is visible
    Kokkos::fence();

    // 2. clean up all open communication
    waitall(reqs.send.size(), reqs.send.data());
#endif  // HAVE_IFPACK2_MPI
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
#endif  // defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)

  // ======================================================================
  // Generic version without using execution space instances
  // - the only difference between device and host architectures is the use
  //   of team vs. range policies.
  // ======================================================================
  template <typename PackTag>
  static void copy(const local_ordinal_type_1d_view &lids_,
                   const impl_scalar_type_1d_view &buffer_,
                   const local_ordinal_type &ibeg_,
                   const local_ordinal_type &iend_,
                   const impl_scalar_type_2d_view_tpetra &multivector_,
                   const local_ordinal_type blocksize_) {
    const local_ordinal_type num_vectors = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
    const local_ordinal_type idiff = iend_ - ibeg_;
    const auto abase = buffer_.data() + mv_blocksize * ibeg_;
    if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
      using team_policy_type = Kokkos::TeamPolicy<execution_space>;
      local_ordinal_type vector_size(0);
      if (blocksize_ <= 4)
        vector_size = 4;
      else if (blocksize_ <= 8)
        vector_size = 8;
      else if (blocksize_ <= 16)
        vector_size = 16;
      else
        vector_size = 32;
      const team_policy_type policy(idiff, 1, vector_size);
      Kokkos::parallel_for(
          "AsyncableImport::TeamPolicy::copy",
          policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
            const local_ordinal_type i = member.league_rank();
            Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](const local_ordinal_type &j) {
              auto aptr = abase + blocksize_ * (i + idiff * j);
              auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
              if (std::is_same<PackTag, ToBuffer>::value)
                Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
                  aptr[k] = bptr[k];
                });
              else
                Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
                  bptr[k] = aptr[k];
                });
            });
          });
    } else {
      const Kokkos::RangePolicy<execution_space> policy(0, idiff * num_vectors);
      Kokkos::parallel_for(
          "AsyncableImport::RangePolicy::copy",
          policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
            const local_ordinal_type i = ij % idiff;
            const local_ordinal_type j = ij / idiff;
            auto aptr = abase + blocksize_ * (i + idiff * j);
            auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
            auto from = std::is_same<PackTag, ToBuffer>::value ? bptr : aptr;
            auto to = std::is_same<PackTag, ToBuffer>::value ? aptr : bptr;
            memcpy(to, from, sizeof(impl_scalar_type) * blocksize_);
          });
    }
  }

  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);

#ifdef HAVE_IFPACK2_MPI
    // constants and reallocate data buffers if necessary
    const local_ordinal_type num_vectors = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize * num_vectors;

    // receive async
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        irecv(comm,
              reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
              (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      } else {
        irecv(comm,
              reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
              (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }
    }

    // send async
    for (local_ordinal_type i = 0, iend = pids.send.extent(0); i < iend; ++i) {
      copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i + 1),
                     mv, blocksize);
      Kokkos::fence();
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        isend(comm,
              reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
              (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      } else {
        Kokkos::deep_copy(
            Kokkos::subview(buffer_host.send,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.send(i) * mv_blocksize,
                                offset_host.send(i + 1) * mv_blocksize)),
            Kokkos::subview(buffer.send,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.send(i) * mv_blocksize,
                                offset_host.send(i + 1) * mv_blocksize)));
        isend(comm,
              reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
              (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }
    }

    // I find that issuing an Iprobe seems to nudge some MPIs into action,
    // which helps with overlapped comm/comp performance.
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void syncRecvVar0() {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
#ifdef HAVE_IFPACK2_MPI
    // wait for the asynchronous receives, unpacking each message as it arrives.
    for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
      local_ordinal_type idx = i;
      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors = remote_multivector.extent(1);
        const local_ordinal_type mv_blocksize = blocksize * num_vectors;
        Kokkos::deep_copy(
            Kokkos::subview(buffer.recv,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.recv(idx) * mv_blocksize,
                                offset_host.recv(idx + 1) * mv_blocksize)),
            Kokkos::subview(buffer_host.recv,
                            Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                offset_host.recv(idx) * mv_blocksize,
                                offset_host.recv(idx + 1) * mv_blocksize)));
      }
      copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx + 1),
                          remote_multivector, blocksize);
    }
    // wait on the sends to match all Isends with a cleanup operation.
    waitall(reqs.send.size(), reqs.send.data());
#endif
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
    asyncSendRecvVar1(mv);
#else
    asyncSendRecvVar0(mv);
#endif
#else
    asyncSendRecvVar0(mv);
#endif
  }
  void syncRecv() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
    syncRecvVar1();
#else
    syncRecvVar0();
#endif
#else
    syncRecvVar0();
#endif
  }

  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
    asyncSendRecv(mv);
    syncRecv();
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

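  // Typical usage sketch (illustrative; the caller-side names are
  // hypothetical):
  //   async_importer->createDataBuffer(num_vectors);
  //   async_importer->syncExchange(mv);  // or asyncSendRecv(mv) ... syncRecv()
  //   auto remote_mv = async_importer->getRemoteMultiVectorLocalView();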
  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
};

template <typename ViewType1, typename ViewType2>
struct are_same_struct {
  ViewType1 keys1;
  ViewType2 keys2;

  are_same_struct(ViewType1 keys1_, ViewType2 keys2_)
      : keys1(keys1_)
      , keys2(keys2_) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(int i, unsigned int &count) const {
    if (keys1(i) != keys2(i)) count++;
  }
};

template <typename ViewType1, typename ViewType2>
bool are_same(ViewType1 keys1, ViewType2 keys2) {
  unsigned int are_same_ = 0;

  Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
                          are_same_struct(keys1, keys2),
                          are_same_);
  return are_same_ == 0;
}
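// Note: the reduction runs over keys1.extent(0), so keys2 must be at least
// that long; any tail of keys2 beyond it is ignored. createBlockCrsAsyncImporter
// below relies on this when comparing the owned (domain-map) gids against the
// column-map gids, whose remote tail is handled separately.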

template <typename MatrixType>
Teuchos::RCP<AsyncableImport<MatrixType>>
createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter", createBlockCrsAsyncImporter);
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
  using global_indices_array_device_type = Kokkos::View<const global_ordinal_type *, typename tpetra_map_type::device_type>;

  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = !A_bcrs.is_null();

  // It is OK here to use the graph of the A_crs matrix and a block size of 1.
  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());  // tpetra crs graph object

  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
  const auto domain_map = g.getDomainMap();
  const auto column_map = g.getColMap();

  std::vector<global_ordinal_type> gids;

  Kokkos::Subview<global_indices_array_device_type, std::pair<int, int>> column_map_global_iD_last;

  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
  {
    IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);

    global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
    global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();

    if (are_same(domain_map_global_iD, column_map_global_iD)) {
      // this should be the most likely path
      separate_remotes = true;
      need_owned_permutation = false;

      column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
                                                  std::pair<int, int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
    } else {
      // This loop is relatively expensive
      for (size_t i = 0; i < column_map->getLocalNumElements(); ++i) {
        const global_ordinal_type gid = column_map->getGlobalElement(i);
        if (!domain_map->isNodeGlobalElement(gid)) {
          found_first = true;
          gids.push_back(gid);
        } else if (found_first) {
          separate_remotes = false;
          break;
        }
        if (!found_first && !need_owned_permutation &&
            domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
          // The owned parts of the domain and column maps are different
          // orderings. We *could* do a super efficient impl of this case in the
          // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
          // really, if a caller cares about speed, they wouldn't make different
          // local permutations like this. So we punt on the best impl and go for
          // a pretty good one: the permutation is done in place in
          // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
          // is the presumably worse memory access pattern of the input vector.
          need_owned_permutation = true;
        }
      }
    }
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  }

  if (separate_remotes) {
    IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
    const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
    const auto parsimonious_col_map =
        need_owned_permutation
            ? Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()))
            : Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
    if (parsimonious_col_map->getGlobalNumElements() > 0) {
      // make the importer only if needed.
      local_ordinal_type_1d_view dm2cm;
      if (need_owned_permutation) {
        dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
        const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
        for (size_t i = 0; i < domain_map->getLocalNumElements(); ++i)
          dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
        Kokkos::deep_copy(dm2cm, dm2cm_host);
      }
      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
      return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
    }
  }
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return Teuchos::null;
}

template <typename local_ordinal_type>
local_ordinal_type costTRSM(const local_ordinal_type block_size) {
  return block_size * block_size;
}

template <typename local_ordinal_type>
local_ordinal_type costGEMV(const local_ordinal_type block_size) {
  return 2 * block_size * block_size;
}

template <typename local_ordinal_type>
local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
  return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length - 1) * costGEMV(block_size);
}
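
// Worked example: block_size = 4 gives costTRSM = 16 and costGEMV = 32, so a
// subline of length 10 costs 2 * 10 * 16 + 2 * 9 * 32 = 896 in this model.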

template <typename local_ordinal_type>
local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
                                  const local_ordinal_type num_teams,
                                  const local_ordinal_type line_length,
                                  const local_ordinal_type block_size,
                                  const local_ordinal_type n_subparts_per_part) {
  const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part - 1) * 2) / n_subparts_per_part);
  if (subline_length < 1) {
    return INT_MAX;
  }

  const local_ordinal_type p_n_lines = ceil(double(num_parts) / num_teams);
  const local_ordinal_type p_n_sublines = ceil(double(n_subparts_per_part) * num_parts / num_teams);
  const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part - 1) * num_parts / num_teams);

  const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
  const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part - 1) * 2, block_size);
  const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length, block_size);
  const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);

  if (n_subparts_per_part == 1) {
    return p_costApplyAinv;
  }
  return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
}

template <typename local_ordinal_type>
local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
                                         const local_ordinal_type num_teams,
                                         const local_ordinal_type line_length,
                                         const local_ordinal_type block_size) {
  // BMK: replaced theoretical model with empirical model
  // This is a linear regression based on data from a grid search.
  // The independent terms in the regression are:
  // - "parallelism surplus" - smaller when problem has enough lines to saturate GPU, larger otherwise
  // - log2 of the line length
  // - block size
  double parallelismSurplus = Kokkos::sqrt((double)num_teams / num_parts);
  double logLineLength = Kokkos::log2((double)line_length);
  (void)logLineLength;
  // Directly predict with linear model
#if defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU)
  // MI300-specific data
  double modeled = -9.2312 + 4.6946 * parallelismSurplus + 0.4095 * block_size + 0.966 * logLineLength;
  // Do not split lines if there is plenty of parallelism
  if (parallelismSurplus < 0.3)
    modeled = 1;
#elif defined(KOKKOS_ARCH_HOPPER) || defined(KOKKOS_ARCH_BLACKWELL)
  // Based on H100 data
  double modeled = -9.6053 + 4.7477 * parallelismSurplus + 0.2338 * block_size + 1.0794 * logLineLength;
  // On H100, performance degrades rapidly if small lines are split too many times
  double maxSplit = (double)line_length / 8;
  if (modeled > maxSplit)
    modeled = maxSplit;
#elif defined(KOKKOS_ENABLE_CUDA)
  // Based on V100 data, line splitting is profitable in fewer cases
  // (only when there are few, long lines)
  double modeled = 1;
  if (parallelismSurplus > 1 && line_length > 64)
    modeled = 4;
#elif defined(KOKKOS_ENABLE_HIP)
  // Based on MI250X data
  double modeled = -8.6214 + 7.3468 * parallelismSurplus + 0.3596 * block_size + 0.6673 * logLineLength;
#else
  // GPUs other than CUDA or HIP: default to simple model that works for V100
  double modeled = 1;
  if (parallelismSurplus > 1 && line_length > 64)
    modeled = 4;
#endif

  // Round to nearest integer
  local_ordinal_type n_subparts_per_part = 0.5 + modeled;
  // Do not split lines if there is plenty of parallelism available
  if (parallelismSurplus < 0.3)
    n_subparts_per_part = 1;
  // Clamp the result to valid range
  // Criteria for valid n_subparts_per_part (where connection_length is 2 for wide separators)
  // line_length >= n_subparts_per_part + (n_subparts_per_part - 1) * connection_length
  // Equivalently:
  // line_length >= n_subparts_per_part + n_subparts_per_part * 2 - 2
  // line_length >= 3 * n_subparts_per_part - 2
  local_ordinal_type min_subparts_per_part = 1;
  local_ordinal_type max_subparts_per_part = (line_length + 2) / 3;
  // Limit memory usage from too many sublines
  if (max_subparts_per_part > 16)
    max_subparts_per_part = 16;
  if (n_subparts_per_part < min_subparts_per_part)
    n_subparts_per_part = min_subparts_per_part;
  if (n_subparts_per_part > max_subparts_per_part)
    n_subparts_per_part = max_subparts_per_part;
  return n_subparts_per_part;
}
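
// Worked example of the clamp above: line_length = 20 allows at most
// (20 + 2) / 3 = 7 subparts (each subpart needs at least one row plus a 2-row
// connection between consecutive subparts), so the model's prediction is
// rounded and clamped to [1, 7]; the memory cap of 16 never binds here.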

template <typename ArgActiveExecutionMemorySpace>
struct SolveTridiagsDefaultModeAndAlgo;

template <typename MatrixType>
BlockHelperDetails::PartInterface<MatrixType>
createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
                    const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
                    const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type>> &partitions,
                    const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
  IFPACK2_BLOCKHELPER_TIMER("createPartInterface", createPartInterface);
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
  using size_type = typename impl_type::size_type;

  auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);

  TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0);
  const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
  constexpr int vector_length = impl_type::vector_length;
  constexpr int internal_vector_length = impl_type::internal_vector_length;

  const auto comm = A->getRowMap()->getComm();

  BlockHelperDetails::PartInterface<MatrixType> interf;

  const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
  const bool jacobi = partitions.size() == 0 || partitions.size() == A_n_lclrows;
  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();

  typedef std::pair<local_ordinal_type, local_ordinal_type> size_idx_pair_type;
  std::vector<size_idx_pair_type> partsz(nparts);

  if (!jacobi) {
    for (local_ordinal_type i = 0; i < nparts; ++i)
      partsz[i] = size_idx_pair_type(partitions[i].size(), i);
    std::sort(partsz.begin(), partsz.end(),
              [](const size_idx_pair_type &x, const size_idx_pair_type &y) {
                return x.first > y.first;
              });
  }

  local_ordinal_type n_subparts_per_part;
  if (jacobi) {
    n_subparts_per_part = 1;
  } else {
    if (n_subparts_per_part_in == -1) {
      // If the number of subparts is set to -1, the user lets the algorithm
      // decide the value automatically
      using execution_space = typename impl_type::execution_space;

      // Line splitting only benefits GPUs
      if constexpr (impl_type::node_type::is_gpu) {
        const int line_length = partsz[0].first;

        const local_ordinal_type team_size =
            SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
                recommended_team_size(blocksize, vector_length, internal_vector_length);

        const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
        n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
#endif
      } else {
        n_subparts_per_part = 1;
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("Automatically chosen n_subparts_per_part = 1 for CPU backend\n");
#endif
      }
    } else {
      n_subparts_per_part = n_subparts_per_part_in;
    }
  }

  // Total number of sub lines:
  const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
  // Total number of sub lines + the Schur complement blocks.
  // For a given line, 2 sub lines imply one Schur complement, 3 sub lines imply two Schur complements, etc.
  const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part - 1);
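  // Example: nparts = 4 lines split into n_subparts_per_part = 3 gives
  // n_sub_parts = 12 sublines plus 4 * 2 = 8 Schur blocks, so
  // n_sub_parts_and_schur = 20.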

#if defined(BLOCKTRIDICONTAINER_DEBUG)
  local_ordinal_type nrows = 0;
  if (jacobi)
    nrows = nparts;
  else
    for (local_ordinal_type i = 0; i < nparts; ++i) nrows += partitions[i].size();

  TEUCHOS_TEST_FOR_EXCEPT_MSG(nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
                                                                                             << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
#endif

  // permutation vector
  std::vector<local_ordinal_type> p;
  if (jacobi) {
    interf.max_partsz = 1;
    interf.max_subpartsz = 0;
    interf.n_subparts_per_part = 1;
    interf.nparts = nparts;
  } else {
    // reorder parts to maximize simd packing efficiency
    p.resize(nparts);

    for (local_ordinal_type i = 0; i < nparts; ++i)
      p[i] = partsz[i].second;

    interf.max_partsz = partsz[0].first;

    constexpr local_ordinal_type connection_length = 2;
    const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
    const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

    interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
    interf.n_subparts_per_part = n_subparts_per_part;
    interf.nparts = nparts;
  }
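  // Example with max_partsz = 20 and n_subparts_per_part = 3:
  // sub_line_length = (20 - 2 * 2) / 3 = 5, last_sub_line_length =
  // 20 - 2 * (2 + 5) = 6, so max_subpartsz = 6.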

  // allocate parts
  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);

  interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
  interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
  interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part_sub"), A_n_lclrows);

  interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);

  // mirror to host and compute on host execution space
  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
  const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);

  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);

  const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
  const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
  const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);

  // Determine parts.
  interf.row_contiguous = true;
  partptr(0) = 0;
  part2rowidx0(0) = 0;
  part2packrowidx0(0) = 0;
  local_ordinal_type pack_nrows = 0;
  local_ordinal_type pack_nrows_sub = 0;
  if (jacobi) {
    IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi);
    // Jacobi (all lines have length 1) means that A_n_lclrows == nparts,
    // so the mapping between parts and rows is trivial.
    // Note: we can leave interf.row_contiguous = true, since for all i: lclrow(i) == i
    for (local_ordinal_type i = 0; i <= nparts; ++i) {
      part2rowidx0(i) = i;
      partptr(i) = i;
    }
    for (local_ordinal_type i = 0; i < nparts; ++i) {
      rowidx2part(i) = i;
      lclrow(i) = i;
    }
    for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
      // assume no overlap.
      if (ip % vector_length == 0) pack_nrows = 1;
      part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
    }
    part2rowidx0_sub(0) = 0;
    partptr_sub(0, 0) = 0;

    for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
      constexpr local_ordinal_type ipnrows = 1;
      const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);

      TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
                                 "In part " << ip << ", the line length does not match the expected number of rows.");

      constexpr local_ordinal_type connection_length = 2;

      if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
        TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
                                   "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");

      const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
      const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

      if (ip % vector_length == 0) pack_nrows_sub = ipnrows;

      for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
        const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
        const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
        if (local_sub_ip != n_subparts_per_part - 1) {
          if (local_sub_ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
          } else if (ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
          }
          partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
          partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
          partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);

          part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
          part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
          printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
#endif
        } else {
          if (local_sub_ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
          } else if (ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
          }
          partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);

          part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
#endif
        }
      }
    }

#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
    std::cout << "partptr_sub = " << std::endl;
    for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
      for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
        std::cout << partptr_sub(i, j) << " ";
      }
      std::cout << std::endl;
    }
    std::cout << "partptr_sub end" << std::endl;
#endif
1334
1335 {
1336 local_ordinal_type npacks = ceil(float(nparts) / vector_length);
1337
1338 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1339 for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1340 part2packrowidx0_sub(ip, 0) = 0;
1341 }
1342 for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1343 if (ipack != 0) {
1344 local_ordinal_type ip_min = ipack * vector_length;
1345 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1346 for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1347 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1348 }
1349 }
1350
1351 for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1352 local_ordinal_type ip_min = ipack * vector_length;
1353 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1354
1355 const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1356
1357 constexpr local_ordinal_type connection_length = 2;
1358
1359 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1360 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1361
1362 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1363 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1364 if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1365
1366 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1367
1368 for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1369 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1370 }
1371 }
1372 }
1373
1374 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1375 }
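 // e.g. n_subparts_per_part = 2 with sub_line_length = 5, connection_length = 2,
 // last_sub_line_length = 5: each lead part's row of part2packrowidx0_sub reads
 // [b, b+5, b+7, b+12], the cumulative packed-row offsets of the first sub-line,
 // the Schur connection, and the last sub-line.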
1376 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1377 } else {
1378 IFPACK2_BLOCKHELPER_TIMER("compute part indices", indices);
1379 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1380 const auto *part = &partitions[p[ip]];
1381 const local_ordinal_type ipnrows = part->size();
1382 TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip - 1]].size())));
1383 TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1384 BlockHelperDetails::get_msg_prefix(comm)
1385 << "partition " << p[ip]
1386 << " is empty, which is not allowed.");
1387 // Assume no overlap.
1388 part2rowidx0(ip + 1) = part2rowidx0(ip) + ipnrows;
1389 // Since parts are ordered in decreasing size, the size of the first
1390 // part in a pack is the size for all parts in the pack.
1391 if (ip % vector_length == 0) pack_nrows = ipnrows;
1392 part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1393 const local_ordinal_type offset = partptr(ip);
1394 for (local_ordinal_type i = 0; i < ipnrows; ++i) {
1395 const auto lcl_row = (*part)[i];
1396 TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1397 BlockHelperDetails::get_msg_prefix(comm)
1398 << "partitions[" << p[ip] << "]["
1399 << i << "] = " << lcl_row
1400 << " but input matrix implies limits of [0, " << A_n_lclrows - 1
1401 << "].");
1402 lclrow(offset + i) = lcl_row;
1403 rowidx2part(offset + i) = ip;
1404 if (interf.row_contiguous && offset + i > 0 && lclrow((offset + i) - 1) + 1 != lcl_row)
1405 interf.row_contiguous = false;
1406 }
1407 partptr(ip + 1) = offset + ipnrows;
1408
1409#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1410 printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1411 printf("partptr(%d+1) = %d\n", ip, partptr(ip + 1));
1412#endif
1413 }
1414
1415 part2rowidx0_sub(0) = 0;
1416 partptr_sub(0, 0) = 0;
1417 // const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length);
1418
1419 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1420 const auto *part = &partitions[p[ip]];
1421 const local_ordinal_type ipnrows = part->size();
1422 const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1423
1424 TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
1425 "Part " << ip << " has length " << full_line_length << " but " << ipnrows << " row(s) were expected.");
1426
1427 constexpr local_ordinal_type connection_length = 2;
1428
1429 TEUCHOS_TEST_FOR_EXCEPTION(full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length,
1430 std::logic_error,
1431 "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1432
1433 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1434 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1435
1436 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1437
1438 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1439 const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1440 const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1441 if (local_sub_ip != n_subparts_per_part - 1) {
1442 if (local_sub_ip != 0) {
1443 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1444 } else if (ip != 0) {
1445 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1446 }
1447 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1448 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1449 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1450
1451 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1452 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1453
1454#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1455 printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1456 printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", schur_ip, partptr_sub(schur_ip, 0), connection_length);
1457#endif
1458 } else {
1459 if (local_sub_ip != 0) {
1460 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1461 } else if (ip != 0) {
1462 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1463 }
1464 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1465
1466 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1467
1468#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1469 printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1470#endif
1471 }
1472 }
1473 }
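 // e.g. nparts = 3, n_subparts_per_part = 2: part ip stores its pieces in rows
 // sub_ip = ip (first sub-line), schur_ip = 3 + ip (Schur connection), and
 // 6 + ip (last sub-line) of partptr_sub, so same-role sub-parts of all parts
 // are contiguous in the leading dimension.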
1474
1475 {
1476 local_ordinal_type npacks = ceil(float(nparts) / vector_length);
1477
1478 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1479 for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1480 part2packrowidx0_sub(ip, 0) = 0;
1481 }
1482 for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1483 if (ipack != 0) {
1484 local_ordinal_type ip_min = ipack * vector_length;
1485 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1486 for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1487 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1488 }
1489 }
1490
1491 for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1492 local_ordinal_type ip_min = ipack * vector_length;
1493 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1494
1495 const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1496
1497 constexpr local_ordinal_type connection_length = 2;
1498
1499 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1500 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1501
1502 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1503 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1504 if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1505
1506 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1507
1508 for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1509 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1510 }
1511 }
1512 }
1513
1514 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1515 }
1516 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1517 }
1518#if defined(BLOCKTRIDICONTAINER_DEBUG)
1519 TEUCHOS_ASSERT(partptr(nparts) == nrows);
1520#endif
1521 if (lclrow(0) != 0) interf.row_contiguous = false;
1522
1523 Kokkos::deep_copy(interf.partptr, partptr);
1524 Kokkos::deep_copy(interf.lclrow, lclrow);
1525
1526 Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1527
1528 // Assume no overlap. Thus:
1529 interf.part2rowidx0 = interf.partptr;
1530 Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1531
1532 interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1533 Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1534
1535 { // Fill packptr.
1536 IFPACK2_BLOCKHELPER_TIMER("Fill packptr", packptr0);
1537 // Count packs: a pack ends wherever the packed-row offset advances.
1538 local_ordinal_type npacks = 0;
1539 for (local_ordinal_type ip = 1; ip <= nparts; ++ip)
1540 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1541 ++npacks;
1542
1543 interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1544 const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1545 packptr(0) = 0;
1546 for (local_ordinal_type ip = 1, k = 1; ip <= nparts; ++ip)
1547 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1548 packptr(k++) = ip;
1549
1550 Kokkos::deep_copy(interf.packptr, packptr);
1551
1552 local_ordinal_type npacks_per_subpart = ceil(float(nparts) / vector_length);
1553 npacks = ceil(float(nparts) / vector_length) * (part2packrowidx0_sub.extent(1) - 1);
1554
1555 interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart * n_subparts_per_part);
1556 interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart, n_subparts_per_part - 1);
1557
1558 const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1559 const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1560
1561 // Fill packindices_sub and packindices_schur
1562 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part - 1; ++local_sub_ip) {
1563 for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1564 packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1565 packindices_schur(local_pack_ip, local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1566 }
1567 }
1568
1569 for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1570 packindices_sub((n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip;
1571 }
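 // e.g. n_subparts_per_part = 3, npacks_per_subpart = 2: packindices_sub =
 // [0, 1, 4, 5, 8, 9] (tridiag packs) while packindices_schur(:, 0) = {2, 3}
 // and packindices_schur(:, 1) = {6, 7} pick out the Schur packs interleaved
 // between them.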
1572
1573#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1574 std::cout << "packindices_sub = " << std::endl;
1575 for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1576 std::cout << packindices_sub(i) << " ";
1577 }
1578 std::cout << std::endl;
1579 std::cout << "packindices_sub end" << std::endl;
1580
1581 std::cout << "packindices_schur = " << std::endl;
1582 for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1583 for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1584 std::cout << packindices_schur(i, j) << " ";
1585 }
1586 std::cout << std::endl;
1587 }
1588
1589 std::cout << "packindices_schur end" << std::endl;
1590#endif
1591
1592 Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1593 Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1594
1595 interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr_sub"), npacks + 1);
1596 const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1597 packptr_sub(0) = 0;
1598 for (local_ordinal_type k = 0; k < npacks + 1; ++k)
1599 packptr_sub(k) = packptr(k % npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1600
1601 Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1602 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1603 }
1604 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1605
1606 return interf;
1607}
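// [Illustrative sketch, not part of the original source.] The pack-counting rule
// used above, reduced to plain ints: parts are grouped vector_length at a time
// and a pack closes at each group boundary or at the last part. The function
// name is hypothetical.
inline int countPacksSketch(const int nparts, const int vector_length) {
  int npacks = 0;
  for (int ip = 0; ip < nparts; ++ip)
    if ((ip + 1) % vector_length == 0 || ip + 1 == nparts)
      ++npacks;
  return npacks;  // equals ceil(nparts / double(vector_length))
}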
1608
1612template <typename MatrixType>
1613struct BlockTridiags {
1614 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1615 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1616 using size_type_1d_view = typename impl_type::size_type_1d_view;
1617 using size_type_2d_view = typename impl_type::size_type_2d_view;
1618 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1619 using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1620 using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1621 using internal_vector_type_3d_view = typename impl_type::internal_vector_type_3d_view;
1622
1623 // flat_td_ptr(i) is the index into flat-array values of the start of the
1624 // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1625 // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1626 // vector_length is the position in the pack.
1627 size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1628 // List of local column indices into A from which to grab
1629 // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1630 local_ordinal_type_1d_view A_colindsub;
1631 // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1632 // tridiag's pack, and i % vector_length gives the position in the pack.
1633 vector_type_3d_view values;
1634 // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th
1635 // Schur's pack, and i % vector_length gives the position in the pack.
1636 vector_type_3d_view values_schur;
1637 // inv(A_00)*A_01 block values.
1638 vector_type_4d_view e_values;
1639 // If doing Schur line splitting: space for permuted version of X,
1640 // to be used during the Schur complement block solves (SolveTridiags, SingleVectorSchurTag).
1641 // Otherwise, this is not allocated.
1642 internal_vector_type_3d_view X_internal_vector_values_schur;
1643
1644 // The following are for fused block Jacobi only.
1645 // For block row i, diag_offset(i)...diag_offset(i + bs^2)
1646 // is the range of scalars for the diagonal block.
1647 size_type_1d_view diag_offsets;
1648 // For fused residual+solve block Jacobi case,
1649 // this contains the diagonal block inverses in flat, local row indexing:
1650 // d_inv(row, :, :) gives the row-major block for row.
1651 btdm_scalar_type_3d_view d_inv;
1652
1653 bool is_diagonal_only;
1654
1655 BlockTridiags() = default;
1656 BlockTridiags(const BlockTridiags &b) = default;
1657
1658 // Index into row-major block of a tridiag.
1659 template <typename idx_type>
1660 static KOKKOS_FORCEINLINE_FUNCTION
1661 idx_type
1662 IndexToRow(const idx_type &ind) { return (ind + 1) / 3; }
1663 // Given a row of a row-major tridiag, return the index of the first block
1664 // in that row.
1665 template <typename idx_type>
1666 static KOKKOS_FORCEINLINE_FUNCTION
1667 idx_type
1668 RowToIndex(const idx_type &row) { return row > 0 ? 3 * row - 1 : 0; }
1669 // Number of blocks in a tridiag having a given number of rows.
1670 template <typename idx_type>
1671 static KOKKOS_FORCEINLINE_FUNCTION
1672 idx_type
1673 NumBlocks(const idx_type &nrows) { return nrows > 0 ? 3 * nrows - 2 : 0; }
1674 // Number of blocks associated to a Schur complement having a given number of rows.
1675 template <typename idx_type>
1676 static KOKKOS_FORCEINLINE_FUNCTION
1677 idx_type
1678 NumBlocksSchur(const idx_type &nrows) { return nrows > 0 ? 3 * nrows + 2 : 0; }
1679};
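// [Illustrative sketch, not part of the original source.] Constexpr mirrors of
// the three index helpers above, checking the row-major tridiag layout
// [d0 u0 | l1 d1 u1 | ... | l_{n-1} d_{n-1}]; the namespace is hypothetical.
namespace BlockTridiagsIndexingSketch {
constexpr int rowToIndex(const int row) { return row > 0 ? 3 * row - 1 : 0; }
constexpr int indexToRow(const int ind) { return (ind + 1) / 3; }
constexpr int numBlocks(const int nrows) { return nrows > 0 ? 3 * nrows - 2 : 0; }
// A 4-row tridiag stores [d0 u0 | l1 d1 u1 | l2 d2 u2 | l3 d3]:
static_assert(numBlocks(4) == 10, "3*4 - 2 blocks in total");
static_assert(rowToIndex(2) == 5, "row 2 starts at its subdiagonal block l2");
static_assert(indexToRow(5) == 2 && indexToRow(6) == 2 && indexToRow(7) == 2,
              "l2, d2, u2 all map back to row 2");
}  // namespace BlockTridiagsIndexingSketch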
1680
1684template <typename MatrixType>
1685BlockTridiags<MatrixType>
1686createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
1687 IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags", createBlockTridiags0);
1688 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1689 using execution_space = typename impl_type::execution_space;
1690 using local_ordinal_type = typename impl_type::local_ordinal_type;
1691 using size_type = typename impl_type::size_type;
1692 using size_type_2d_view = typename impl_type::size_type_2d_view;
1693
1694 constexpr int vector_length = impl_type::vector_length;
1695
1696 BlockTridiags<MatrixType> btdm;
1697
1698 const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1699
1700 { // construct the flat index pointers into the tridiag values array.
1701 btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1702 const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part);
1703 Kokkos::parallel_scan(
1704 "createBlockTridiags::RangePolicy::flat_td_ptr",
1705 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1706 const local_ordinal_type partidx = i / (2 * interf.n_subparts_per_part);
1707 const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1708
1709 if (final) {
1710 btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1711 }
1712 if (local_subpartidx != (2 * interf.n_subparts_per_part - 1)) {
1713 const local_ordinal_type nrows = interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 1) - interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 0);
1714 if (local_subpartidx % 2 == 0)
1715 update += btdm.NumBlocks(nrows);
1716 else
1717 update += btdm.NumBlocksSchur(nrows);
1718 }
1719 });
1720
1721 const auto nblocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts - 1, 2 * interf.n_subparts_per_part - 1));
1722 btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1723 }
1724
1725 // And the packed index pointers.
1726 if (vector_length == 1) {
1727 btdm.pack_td_ptr = btdm.flat_td_ptr;
1728 } else {
1729 // const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1;
1730
1731 local_ordinal_type npacks_per_subpart = 0;
1732 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1733 Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1734 for (local_ordinal_type ip = 1; ip <= interf.nparts; ++ip)
1735 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1736 ++npacks_per_subpart;
1737
1738 btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1739 const Kokkos::RangePolicy<execution_space> policy(0, npacks_per_subpart);
1740
1741 Kokkos::parallel_for(
1742 "createBlockTridiags::RangePolicy::pack_td_ptr",
1743 policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1744 for (local_ordinal_type j = 0; j < 2 * interf.n_subparts_per_part; ++j) {
1745 const local_ordinal_type pack_id = (j == 2 * interf.n_subparts_per_part - 1) ? i + (j - 1) * npacks_per_subpart : i + j * npacks_per_subpart;
1746 const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id + 1) - interf.packptr_sub(pack_id);
1747
1748 const local_ordinal_type parti = interf.packptr_sub(pack_id);
1749 const local_ordinal_type partidx = parti % interf.nparts;
1750
1751 for (local_ordinal_type pti = 0; pti < nparts_in_pack; ++pti) {
1752 btdm.pack_td_ptr(partidx + pti, j) = btdm.flat_td_ptr(i, j);
1753 }
1754 }
1755 });
1756 }
1757
1758 btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1759
1760 const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1761 constexpr local_ordinal_type connection_length = 2;
1762
1763 host_pack_td_ptr_schur(0, 0) = 0;
1764 for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1765 if (i % vector_length == 0) {
1766 if (i != 0)
1767 host_pack_td_ptr_schur(i, 0) = host_pack_td_ptr_schur(i - 1, host_pack_td_ptr_schur.extent(1) - 1);
1768 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part - 1; ++j) {
1769 host_pack_td_ptr_schur(i, j + 1) = host_pack_td_ptr_schur(i, j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part - 2 ? 1 : 0);
1770 }
1771 } else {
1772 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1773 host_pack_td_ptr_schur(i, j) = host_pack_td_ptr_schur(i - 1, j);
1774 }
1775 }
1776 }
1777
1778 Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
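 // e.g. n_subparts_per_part = 3: each Schur system contributes NumBlocks(2) = 4
 // tridiag blocks plus one coupling block per interior side, so a lead part's
 // row of pack_td_ptr_schur reads [b, b + 5, b + 10].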
1779
1780#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1781 const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1782 std::cout << "flat_td_ptr = " << std::endl;
1783 for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1784 for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1785 std::cout << host_flat_td_ptr(i, j) << " ";
1786 }
1787 std::cout << std::endl;
1788 }
1789 std::cout << "flat_td_ptr end" << std::endl;
1790
1791 const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1792
1793 std::cout << "pack_td_ptr = " << std::endl;
1794 for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1795 for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1796 std::cout << host_pack_td_ptr(i, j) << " ";
1797 }
1798 std::cout << std::endl;
1799 }
1800 std::cout << "pack_td_ptr end" << std::endl;
1801
1802 std::cout << "pack_td_ptr_schur = " << std::endl;
1803 for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1804 for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1805 std::cout << host_pack_td_ptr_schur(i, j) << " ";
1806 }
1807 std::cout << std::endl;
1808 }
1809 std::cout << "pack_td_ptr_schur end" << std::endl;
1810#endif
1811
1812 // values and A_colindsub are created in the symbolic phase
1813 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1814
1815 return btdm;
1816}
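// e.g. one part split into two sub-lines of 5 rows joined by a 2-row Schur
// connection: its row of flat_td_ptr reads [0, 13, 21, 34], since each sub-line
// tridiag has NumBlocks(5) = 13 blocks and the connection contributes
// NumBlocksSchur(2) = 8.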
1817
1818// Set the tridiags to be I to the full pack block size. That way, if a
1819// tridiag within a pack is shorter than the longest one, the extra blocks are
1820// processed in a safe way. Similarly, in the solve phase, if the extra blocks
1821// in the packed multivector are 0, and the tridiag LU reflects the extra I
1822// blocks, then the solve proceeds as though the extra blocks aren't
1823// present. Since this extra work is part of the SIMD calls, it's not actually
1824// extra work. Instead, it means we don't have to put checks or masks in, or
1825// quiet NaNs. This functor has to be called just once, in the symbolic phase,
1826// since the numeric phase fills in only the used entries, leaving these I
1827// blocks intact.
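// For instance, a pack holding parts of 3 and 2 rows is sized for 3 rows: the
// shorter part's third diagonal block stays I and its packed x entries stay 0,
// so the LU solve leaves that lane untouched at no extra logical cost.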
1828template <typename MatrixType>
1829void setTridiagsToIdentity(const BlockTridiags<MatrixType> &btdm,
1830 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view &packptr) {
1831 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1832 using execution_space = typename impl_type::execution_space;
1833 using local_ordinal_type = typename impl_type::local_ordinal_type;
1834 using size_type_2d_view = typename impl_type::size_type_2d_view;
1835
1836 const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1837 const local_ordinal_type blocksize = btdm.values.extent(1);
1838
1839 {
1840 const int vector_length = impl_type::vector_length;
1841 const int internal_vector_length = impl_type::internal_vector_length;
1842
1843 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1844 using internal_vector_type = typename impl_type::internal_vector_type;
1845 using internal_vector_type_4d_view =
1846 typename impl_type::internal_vector_type_4d_view;
1847
1848 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1849 const internal_vector_type_4d_view values(reinterpret_cast<internal_vector_type *>(btdm.values.data()),
1850 btdm.values.extent(0),
1851 btdm.values.extent(1),
1852 btdm.values.extent(2),
1853 vector_length / internal_vector_length);
1854 const local_ordinal_type vector_loop_size = values.extent(3);
1855#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1856 local_ordinal_type total_team_size(0);
1857 if (blocksize <= 5)
1858 total_team_size = 32;
1859 else if (blocksize <= 9)
1860 total_team_size = 64;
1861 else if (blocksize <= 12)
1862 total_team_size = 96;
1863 else if (blocksize <= 16)
1864 total_team_size = 128;
1865 else
1866 total_team_size = 160;
1869 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1870 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1871#elif defined(KOKKOS_ENABLE_HIP)
1872 // FIXME: HIP
1873 // These settings might be completely wrong
1874 // will have to do some experiments to decide
1875 // what makes sense on AMD GPUs
1876 local_ordinal_type total_team_size(0);
1877 if (blocksize <= 5)
1878 total_team_size = 32;
1879 else if (blocksize <= 9)
1880 total_team_size = 64;
1881 else if (blocksize <= 12)
1882 total_team_size = 96;
1883 else if (blocksize <= 16)
1884 total_team_size = 128;
1885 else
1886 total_team_size = 160;
1889 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1890 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1891#elif defined(KOKKOS_ENABLE_SYCL)
1892 // SYCL: FIXME
1893 local_ordinal_type total_team_size(0);
1894 if (blocksize <= 5)
1895 total_team_size = 32;
1896 else if (blocksize <= 9)
1897 total_team_size = 64;
1898 else if (blocksize <= 12)
1899 total_team_size = 96;
1900 else if (blocksize <= 16)
1901 total_team_size = 128;
1902 else
1903 total_team_size = 160;
1906 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1907 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1908#else
1909 // Host architecture: team size is always one
1910 const team_policy_type policy(packptr.extent(0) - 1, 1, 1);
1911#endif
1912 Kokkos::parallel_for(
1913 "setTridiagsToIdentity::TeamPolicy",
1914 policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1915 const local_ordinal_type k = member.league_rank();
1916 const local_ordinal_type ibeg = pack_td_ptr(packptr(k), 0);
1917 const local_ordinal_type iend = pack_td_ptr(packptr(k), pack_td_ptr.extent(1) - 1);
1918
1919 const local_ordinal_type diff = iend - ibeg;
1920 const local_ordinal_type icount = diff / 3 + (diff % 3 > 0);
1921 const btdm_scalar_type one(1);
1922 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
1923 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, icount), [&](const local_ordinal_type &ii) {
1924 const local_ordinal_type i = ibeg + ii * 3;
1925 for (local_ordinal_type j = 0; j < blocksize; ++j) {
1926 values(i, j, j, v) = one;
1927 }
1928 });
1929 });
1930 });
1931 }
1932}
1933
1937template <typename MatrixType>
1938void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1939 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1940 const BlockHelperDetails::PartInterface<MatrixType> &interf,
1941 BlockTridiags<MatrixType> &btdm,
1942 BlockHelperDetails::AmD<MatrixType> &amd,
1943 const bool overlap_communication_and_computation,
1944 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
1945 bool useSeqMethod,
1946 bool use_fused_jacobi) {
1947 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase);
1948
1949 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1950
1951 using execution_space = typename impl_type::execution_space;
1952 using host_execution_space = typename impl_type::host_execution_space;
1953
1954 using local_ordinal_type = typename impl_type::local_ordinal_type;
1955 using global_ordinal_type = typename impl_type::global_ordinal_type;
1956 using size_type = typename impl_type::size_type;
1957 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1958 using size_type_1d_view = typename impl_type::size_type_1d_view;
1959 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1960 using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1961 using internal_vector_type_3d_view = typename impl_type::internal_vector_type_3d_view;
1962 using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
1963 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1964 using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1965
1966 constexpr int vector_length = impl_type::vector_length;
1967 constexpr int internal_vector_length = impl_type::internal_vector_length;
1968
1969 const auto comm = A->getRowMap()->getComm();
1970
1971 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
1972 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
1973
1974 bool hasBlockCrsMatrix = !A_bcrs.is_null();
1975 TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
1976 const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows() / g->getLocalNumRows();
1977
1978 // mirroring to host
1979 const auto partptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.partptr);
1980 const auto lclrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.lclrow);
1981 const auto rowidx2part = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.rowidx2part);
1982 const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1983 const auto packptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.packptr);
1984
1985 const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1986
1987 Kokkos::View<local_ordinal_type *, host_execution_space> col2row("col2row", A->getLocalNumCols());
1988
1989 // find column to row map on host
1990
1991 Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1992 {
1993 const auto rowmap = g->getRowMap();
1994 const auto colmap = g->getColMap();
1995 const auto dommap = g->getDomainMap();
1996 TEUCHOS_ASSERT(!(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1997 rowmap->lazyPushToHost();
1998 colmap->lazyPushToHost();
1999 dommap->lazyPushToHost();
2000
2001#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
2002 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2003 Kokkos::parallel_for(
2004 "performSymbolicPhase::RangePolicy::col2row",
2005 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2006 const global_ordinal_type gid = rowmap->getGlobalElement(lr);
2007 TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
2008 if (dommap->isNodeGlobalElement(gid)) {
2009 const local_ordinal_type lc = colmap->getLocalElement(gid);
2010#if defined(BLOCKTRIDICONTAINER_DEBUG)
2011 TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
2012 BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
2013 << " gives an invalid local column.");
2014#endif
2015 col2row(lc) = lr;
2016 }
2017 });
2018#endif
2019 }
2020
2021 // construct the D and R graphs in A = D + R.
2022 {
2023 const auto local_graph = g->getLocalGraphHost();
2024 const auto local_graph_rowptr = local_graph.row_map;
2025 TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
2026 const auto local_graph_colidx = local_graph.entries;
2027
2028 // assume no overlap.
2029
2030 Kokkos::View<local_ordinal_type *, host_execution_space> lclrow2idx("lclrow2idx", nrows);
2031 {
2032 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2033 Kokkos::parallel_for(
2034 "performSymbolicPhase::RangePolicy::lclrow2idx",
2035 policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
2036 lclrow2idx[lclrow(i)] = i;
2037 });
2038 }
2039
2040 // count (block) nnzs in D and R.
2041 using sum_reducer_type = BlockHelperDetails::SumReducer<size_type, 3, host_execution_space>;
2042 typename sum_reducer_type::value_type sum_reducer_value;
2043 {
2044 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2045 Kokkos::parallel_reduce
2046 // profiling interface does not work
2047 ( //"performSymbolicPhase::RangePolicy::count_nnz",
2048 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
2049 // LID -> index.
2050 const local_ordinal_type ri0 = lclrow2idx[lr];
2051 const local_ordinal_type pi0 = rowidx2part(ri0);
2052 for (size_type j = local_graph_rowptr(lr); j < local_graph_rowptr(lr + 1); ++j) {
2053 const local_ordinal_type lc = local_graph_colidx(j);
2054 const local_ordinal_type lc2r = col2row[lc];
2055 bool incr_R = false;
2056 do { // breakable
2057 if (lc2r == (local_ordinal_type)-1) {
2058 incr_R = true;
2059 break;
2060 }
2061 const local_ordinal_type ri = lclrow2idx[lc2r];
2062 const local_ordinal_type pi = rowidx2part(ri);
2063 if (pi != pi0) {
2064 incr_R = true;
2065 break;
2066 }
2067 // Test for being in the tridiag. This is done in index space. In
2068 // LID space, tridiag LIDs in a row are not necessarily related by
2069 // {-1, 0, 1}.
2070 if (ri0 + 1 >= ri && ri0 <= ri + 1)
2071 ++update.v[0]; // D_nnz
2072 else
2073 incr_R = true;
2074 } while (0);
2075 if (incr_R) {
2076 if (lc < nrows)
2077 ++update.v[1]; // R_nnz_owned
2078 else
2079 ++update.v[2]; // R_nnz_remote
2080 }
2081 }
2082 },
2083 sum_reducer_type(sum_reducer_value));
2084 }
2085 size_type D_nnz = sum_reducer_value.v[0];
2086 size_type R_nnz_owned = sum_reducer_value.v[1];
2087 size_type R_nnz_remote = sum_reducer_value.v[2];
2088
2089 if (!overlap_communication_and_computation) {
2090 R_nnz_owned += R_nnz_remote;
2091 R_nnz_remote = 0;
2092 }
2093
2094 // construct the D_00 graph.
2095 {
2096 const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
2097
2098 btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
2099 const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
2100
2101#if defined(BLOCKTRIDICONTAINER_DEBUG)
2102 Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2103#endif
2104
2105 const local_ordinal_type nparts = partptr.extent(0) - 1;
2106
2107 {
2108 const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
2109 Kokkos::parallel_for(
2110 "performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
2111 policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
2112 const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2113 local_ordinal_type offset = 0;
2114 for (local_ordinal_type ri0 = partptr(pi0); ri0 < partptr(pi0 + 1); ++ri0) {
2115 const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2116 offset = 1;
2117 const local_ordinal_type lr0 = lclrow(ri0);
2118 const size_type j0 = local_graph_rowptr(lr0);
2119 for (size_type j = j0; j < local_graph_rowptr(lr0 + 1); ++j) {
2120 const local_ordinal_type lc = local_graph_colidx(j);
2121 const local_ordinal_type lc2r = col2row[lc];
2122 if (lc2r == (local_ordinal_type)-1) continue;
2123 const local_ordinal_type ri = lclrow2idx[lc2r];
2124 const local_ordinal_type pi = rowidx2part(ri);
2125 if (pi != pi0) continue;
2126 if (ri + 1 < ri0 || ri > ri0 + 1) continue;
2127 const local_ordinal_type row_entry = j - j0;
2128 D_A_colindsub(flat_td_ptr(pi0, 0) + ((td_row_os + ri) - ri0)) = row_entry;
2129 }
2130 }
2131 });
2132 }
2133#if defined(BLOCKTRIDICONTAINER_DEBUG)
2134 for (size_t i = 0; i < D_A_colindsub.extent(0); ++i)
2135 TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2136#endif
2137 Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2138
2139 // Allocate values.
2140 {
2141 const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0) - 1, btdm.pack_td_ptr.extent(1) - 1);
2142 const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2143 btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
2144
2145 if (interf.n_subparts_per_part > 1) {
2146 const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0) - 1, btdm.pack_td_ptr_schur.extent(1) - 1);
2147 const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2148 btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2149 }
2150
2151 if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2152 }
2153 }
2154
2155 // Construct the R graph.
2156 {
2157 amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
2158 amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
2159
2160 const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2161 const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2162
2163 amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2164 amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
2165
2166 const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2167 const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2168
2169 {
2170 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2171 Kokkos::parallel_for(
2172 "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2173 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2174 const local_ordinal_type ri0 = lclrow2idx[lr];
2175 const local_ordinal_type pi0 = rowidx2part(ri0);
2176 const size_type j0 = local_graph_rowptr(lr);
2177 for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2178 const local_ordinal_type lc = local_graph_colidx(j);
2179 const local_ordinal_type lc2r = col2row[lc];
2180 if (lc2r != (local_ordinal_type)-1) {
2181 const local_ordinal_type ri = lclrow2idx[lc2r];
2182 const local_ordinal_type pi = rowidx2part(ri);
2183 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2184 continue;
2185 }
2186 }
2187 // exclusive scan will be performed later
2188 if (!overlap_communication_and_computation || lc < nrows) {
2189 ++R_rowptr(lr);
2190 } else {
2191 ++R_rowptr_remote(lr);
2192 }
2193 }
2194 });
2195 }
2196
2197 // exclusive scan
2198 typedef BlockHelperDetails::ArrayValueType<size_type, 2> update_type;
2199 {
2200 Kokkos::RangePolicy<host_execution_space> policy(0, nrows + 1);
2201 Kokkos::parallel_scan(
2202 "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2203 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, update_type &update, const bool &final) {
2204 update_type val;
2205 val.v[0] = R_rowptr(lr);
2206 if (overlap_communication_and_computation)
2207 val.v[1] = R_rowptr_remote(lr);
2208
2209 if (final) {
2210 R_rowptr(lr) = update.v[0];
2211 if (overlap_communication_and_computation)
2212 R_rowptr_remote(lr) = update.v[1];
2213
2214 if (lr < nrows) {
2215 const local_ordinal_type ri0 = lclrow2idx[lr];
2216 const local_ordinal_type pi0 = rowidx2part(ri0);
2217
2218 size_type cnt_rowptr = R_rowptr(lr);
2219 size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
2220
2221 const size_type j0 = local_graph_rowptr(lr);
2222 for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2223 const local_ordinal_type lc = local_graph_colidx(j);
2224 const local_ordinal_type lc2r = col2row[lc];
2225 if (lc2r != (local_ordinal_type)-1) {
2226 const local_ordinal_type ri = lclrow2idx[lc2r];
2227 const local_ordinal_type pi = rowidx2part(ri);
2228 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2229 continue;
2230 }
2231 const local_ordinal_type row_entry = j - j0;
2232 if (!overlap_communication_and_computation || lc < nrows)
2233 R_A_colindsub(cnt_rowptr++) = row_entry;
2234 else
2235 R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2236 }
2237 }
2238 }
2239 update += val;
2240 });
2241 }
2242 TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
2243 Kokkos::deep_copy(amd.rowptr, R_rowptr);
2244 Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2245 if (overlap_communication_and_computation) {
2246 TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
2247 Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2248 Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2249 }
2250
2251 // Allocate or view values.
2252 if (hasBlockCrsMatrix)
2253 amd.tpetra_values = (const_cast<block_crs_matrix_type *>(A_bcrs.get())->getValuesDeviceNonConst());
2254 else {
2255 amd.tpetra_values = (const_cast<crs_matrix_type *>(A_crs.get()))->getLocalValuesDevice(Tpetra::Access::ReadWrite);
2256 }
2257 }
2258
2259 if (interf.n_subparts_per_part > 1) {
2260 // If doing Schur complement line splitting, allocate E and space for permuted X
2261 btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2262 btdm.X_internal_vector_values_schur = internal_vector_type_3d_view(
2263 do_not_initialize_tag("X_internal_vector_values_schur"),
2264 2 * (interf.n_subparts_per_part - 1) * interf.part2packrowidx0_sub.extent(0),
2265 blocksize,
2266 vector_length / internal_vector_length);
2267 }
2268 }
2269 // Precompute offsets of each A and x entry to speed up residual.
2270 // Applies if all of these are true:
2271 // - hasBlockCrsMatrix
2272 // - execution_space is a GPU
2273 // - !useSeqMethod (since this uses a different scheme for indexing A,x)
2274 //
2275 // Reading A, x take up to 4 and 6 levels of indirection respectively,
2276 // but precomputing the offsets reduces it to 2 for both (get index, then value)
2277 if (BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix) {
2278 bool is_async_importer_active = !async_importer.is_null();
2279 local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2280 bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2281 BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2282 }
2283
2284 // If using fused block Jacobi path, allocate diagonal inverses here (d_inv) and find diagonal offsets.
2285 if (use_fused_jacobi) {
2286 btdm.d_inv = btdm_scalar_type_3d_view(do_not_initialize_tag("btdm.d_inv"), interf.nparts, blocksize, blocksize);
2287 auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice();
2288 auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice();
2289 btdm.diag_offsets = BlockHelperDetails::findDiagOffsets<execution_space, size_type_1d_view>(rowptrs, entries, interf.nparts, blocksize);
2290 }
2291 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2292}
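// e.g. for a 2D 5-point stencil with parts chosen as grid lines, couplings to
// the +/-1 neighbors within a line land in the tridiag (D) graph, while
// cross-line couplings land in R: locally owned columns go to amd.rowptr /
// amd.A_colindsub and ghosted columns to the *_remote arrays when communication
// is overlapped with computation.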
2293
2297template <typename ArgActiveExecutionMemorySpace>
2298struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;
2299
2300template <>
2301struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2302 typedef KB::Mode::Serial mode_type;
2303#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2304 typedef KB::Algo::Level3::CompactMKL algo_type;
2305#else
2306 typedef KB::Algo::Level3::Blocked algo_type;
2307#endif
2308 static int recommended_team_size(const int /* blksize */,
2309 const int /* vector_length */,
2310 const int /* internal_vector_length */) {
2311 return 1;
2312 }
2313};
2314
2315#if defined(KOKKOS_ENABLE_CUDA)
2316static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
2317 const int vector_length,
2318 const int internal_vector_length) {
2319 const int vector_size = vector_length / internal_vector_length;
2320 int total_team_size(0);
2321 if (blksize <= 5)
2322 total_team_size = 32;
2323 else if (blksize <= 9)
2324 total_team_size = 32; // 64
2325 else if (blksize <= 12)
2326 total_team_size = 96;
2327 else if (blksize <= 16)
2328 total_team_size = 128;
2329 else
2330 total_team_size = 160;
2333 return 2 * total_team_size / vector_size;
2334}
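// e.g. blksize = 9, vector_length = 8, internal_vector_length = 4:
// vector_size = 2, total_team_size = 32, so the recommended team size is
// 2 * 32 / 2 = 32.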
2335template <>
2336struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2337 typedef KB::Mode::Team mode_type;
2338 typedef KB::Algo::Level3::Unblocked algo_type;
2339 static int recommended_team_size(const int blksize,
2340 const int vector_length,
2341 const int internal_vector_length) {
2342 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2343 }
2344};
2345template <>
2346struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2347 typedef KB::Mode::Team mode_type;
2348 typedef KB::Algo::Level3::Unblocked algo_type;
2349 static int recommended_team_size(const int blksize,
2350 const int vector_length,
2351 const int internal_vector_length) {
2352 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2353 }
2354};
2355#endif
2356
2357#if defined(KOKKOS_ENABLE_HIP)
2358static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
2359 const int vector_length,
2360 const int internal_vector_length) {
2361 const int vector_size = vector_length / internal_vector_length;
2362 int total_team_size(0);
2363 if (blksize <= 5)
2364 total_team_size = 32;
2365 else if (blksize <= 9)
2366 total_team_size = 32; // 64
2367 else if (blksize <= 12)
2368 total_team_size = 96;
2369 else if (blksize <= 16)
2370 total_team_size = 128;
2371 else
2372 total_team_size = 160;
2375 return 2 * total_team_size / vector_size;
2376}
2377template <>
2378struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2379 typedef KB::Mode::Team mode_type;
2380 typedef KB::Algo::Level3::Unblocked algo_type;
2381 static int recommended_team_size(const int blksize,
2382 const int vector_length,
2383 const int internal_vector_length) {
2384 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2385 }
2386};
2387template <>
2388struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2389 typedef KB::Mode::Team mode_type;
2390 typedef KB::Algo::Level3::Unblocked algo_type;
2391 static int recommended_team_size(const int blksize,
2392 const int vector_length,
2393 const int internal_vector_length) {
2394 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2395 }
2396};
2397#endif
2398
2399#if defined(KOKKOS_ENABLE_SYCL)
2400static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
2401 const int vector_length,
2402 const int internal_vector_length) {
2403 const int vector_size = vector_length / internal_vector_length;
2404 int total_team_size(0);
2405 if (blksize <= 5)
2406 total_team_size = 32;
2407 else if (blksize <= 9)
2408 total_team_size = 32; // 64
2409 else if (blksize <= 12)
2410 total_team_size = 96;
2411 else if (blksize <= 16)
2412 total_team_size = 128;
2413 else
2414 total_team_size = 160;
2417 return 2 * total_team_size / vector_size;
2418}
2419template <>
2420struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2421 typedef KB::Mode::Team mode_type;
2422 typedef KB::Algo::Level3::Unblocked algo_type;
2423 static int recommended_team_size(const int blksize,
2424 const int vector_length,
2425 const int internal_vector_length) {
2426 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2427 }
2428};
2429template <>
2430struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2431 typedef KB::Mode::Team mode_type;
2432 typedef KB::Algo::Level3::Unblocked algo_type;
2433 static int recommended_team_size(const int blksize,
2434 const int vector_length,
2435 const int internal_vector_length) {
2436 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2437 }
2438};
2439#endif
2440
2441template <typename impl_type, typename WWViewType>
2442KOKKOS_INLINE_FUNCTION void
2443solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2444 const typename impl_type::local_ordinal_type & /* blocksize */,
2445 const typename impl_type::local_ordinal_type &i0,
2446 const typename impl_type::local_ordinal_type &r0,
2447 const typename impl_type::local_ordinal_type &nrows,
2448 const typename impl_type::local_ordinal_type &v,
2449 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2450 const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2451 const WWViewType &WW,
2452 const bool skip_first_pass = false) {
2453 using execution_space = typename impl_type::execution_space;
2454 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2455 using member_type = typename team_policy_type::member_type;
2456 using local_ordinal_type = typename impl_type::local_ordinal_type;
2457
2458 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2459
2460 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2461 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2462
2463 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2464
2465 // constant
2466#if KOKKOS_VERSION >= 40799
2467 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
2468#else
2469 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2470#endif
2471#if KOKKOS_VERSION >= 40799
2472 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
2473#else
2474 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2475#endif
2476
2477 // subview pattern
2478 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2479 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2480 auto X2 = X1;
2481
2482 local_ordinal_type i = i0, r = r0;
2483
2484 if (nrows > 1) {
2485 // solve Lx = x
2486 if (skip_first_pass) {
2487 i += (nrows - 2) * 3;
2488 r += (nrows - 2);
2490 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2491 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2492 KB::Trsm<member_type,
2493 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2494 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2495 X1.assign_data(X2.data());
2496 i += 3;
2497 } else {
2498 KB::Trsm<member_type,
2499 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2500 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2501 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
2502 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2503 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2504 member.team_barrier();
2505 KB::Gemm<member_type,
2506 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2507 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2508 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2509 KB::Trsm<member_type,
2510 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2511 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2512 X1.assign_data(X2.data());
2513 }
2514 }
2515
2516 // solve Ux = x
2517 KB::Trsm<member_type,
2518 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2519 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2520 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2521 i -= 3;
2522 A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
2523 X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
2524 member.team_barrier();
2525 KB::Gemm<member_type,
2526 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2527 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2528
2529 A.assign_data(&D_internal_vector_values(i, 0, 0, v));
2530 KB::Trsm<member_type,
2531 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2532 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2533 X1.assign_data(X2.data());
2534 }
2535 } else {
2536 // matrix is already inverted
2537 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2538 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
2539 member.team_barrier();
2540 KB::Gemm<member_type,
2541 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2542 default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
2543 }
2544}
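// [Illustrative sketch, not part of the original source.] The sweep above with
// blocksize = 1 and a single SIMD lane, assuming T holds the factored tridiag
// in the flat layout [d0 u0 | l1 d1 u1 | ... | l_{n-1} d_{n-1}] with the unit
// lower factor in l_r and the upper factor in d_r, u_r; the name is hypothetical.
inline void solveTridiagScalarSketch(const double *T, double *x, const int nrows) {
  if (nrows == 1) {
    x[0] *= T[0];  // single-row tridiags store the inverted block, so solve == multiply
    return;
  }
  // Forward substitution, L x = b with unit diagonal: x_r -= l_r * x_{r-1}.
  for (int r = 1; r < nrows; ++r)
    x[r] -= T[3 * r - 1] * x[r - 1];
  // Backward substitution, U x = y: x_r = (x_r - u_r * x_{r+1}) / d_r.
  x[nrows - 1] /= T[3 * (nrows - 1)];
  for (int r = nrows - 2; r >= 0; --r)
    x[r] = (x[r] - T[3 * r + 1] * x[r + 1]) / T[3 * r];
}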
2545
2546template <typename impl_type, typename WWViewType, typename XViewType>
2547KOKKOS_INLINE_FUNCTION void
2548solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2549 const typename impl_type::local_ordinal_type &blocksize,
2550 const typename impl_type::local_ordinal_type &i0,
2551 const typename impl_type::local_ordinal_type &r0,
2552 const typename impl_type::local_ordinal_type &nrows,
2553 const typename impl_type::local_ordinal_type &v,
2554 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2555 const XViewType &X_internal_vector_values, // Unmanaged<typename impl_type::internal_vector_type_4d_view>
2556 const WWViewType &WW) {
2557 using execution_space = typename impl_type::execution_space;
2558 // using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2559 // using member_type = typename team_policy_type::member_type;
2560 using local_ordinal_type = typename impl_type::local_ordinal_type;
2561
2562 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2563
2564 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2565 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2566
2567 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2568
2569 // base pointers
2570 auto A = D_internal_vector_values.data();
2571 auto X = X_internal_vector_values.data();
2572
2573 // constant
2574#if KOKKOS_VERSION >= 40799
2575 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
2576#else
2577 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2578#endif
2579#if KOKKOS_VERSION >= 40799
2580 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
2581#else
2582 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2583#endif
2584 // const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2585
2586 // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2587 const local_ordinal_type astep = D_internal_vector_values.stride(0);
2588 const local_ordinal_type as0 = D_internal_vector_values.stride(1); // blocksize*vector_length;
2589 const local_ordinal_type as1 = D_internal_vector_values.stride(2); // vector_length;
2590 const local_ordinal_type xstep = X_internal_vector_values.stride(0);
2591 const local_ordinal_type xs0 = X_internal_vector_values.stride(1); // vector_length;
2592
2593 // move to starting point
2594 A += i0 * astep + v;
2595 X += r0 * xstep + v;
2596
2597 // for (local_ordinal_type col=0;col<num_vectors;++col)
2598 if (nrows > 1) {
2599 // solve Lx = x
2600 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2601 member,
2602 KB::Diag::Unit,
2603 blocksize, blocksize,
2604 one,
2605 A, as0, as1,
2606 X, xs0);
2607
2608 for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
2609 member.team_barrier();
2610 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2611 member,
2612 blocksize, blocksize,
2613 -one,
2614 A + 2 * astep, as0, as1,
2615 X, xs0,
2616 one,
2617 X + 1 * xstep, xs0);
2618 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2619 member,
2620 KB::Diag::Unit,
2621 blocksize, blocksize,
2622 one,
2623 A + 3 * astep, as0, as1,
2624 X + 1 * xstep, xs0);
2625
2626 A += 3 * astep;
2627 X += 1 * xstep;
2628 }
2629
2630 // solve Ux = x
2631 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2632 member,
2633 KB::Diag::NonUnit,
2634 blocksize, blocksize,
2635 one,
2636 A, as0, as1,
2637 X, xs0);
2638
2639 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2640 A -= 3 * astep;
2641 member.team_barrier();
2642 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2643 member,
2644 blocksize, blocksize,
2645 -one,
2646 A + 1 * astep, as0, as1,
2647 X, xs0,
2648 one,
2649 X - 1 * xstep, xs0);
2650 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2651 member,
2652 KB::Diag::NonUnit,
2653 blocksize, blocksize,
2654 one,
2655 A, as0, as1,
2656 X - 1 * xstep, xs0);
2657 X -= 1 * xstep;
2658 }
2659 // for multiple rhs
2660 // X += xs1;
2661 } else {
2662 const local_ordinal_type ws0 = WW.stride(0);
2663 auto W = WW.data() + v;
2664 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
2665 member, blocksize, X, xs0, W, ws0);
2666 member.team_barrier();
2667 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2668 member,
2669 blocksize, blocksize,
2670 one,
2671 A, as0, as1,
2672 W, xs0,
2673 zero,
2674 X, xs0);
2675 }
2676}
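// Reading guide for the pointer arithmetic above, assuming the same packed
// [D_r, U_r, L_r] triple layout:
//   A + 0 * astep : D_r (diagonal block)
//   A + 1 * astep : U_r (super-diagonal, used in the backward sweep)
//   A + 2 * astep : L_r (sub-diagonal, used in the forward sweep)
//   A + 3 * astep : D_{r+1}
// and X + 1 * xstep is the next block row of the right-hand side.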
2677
2678template <typename local_ordinal_type, typename ViewType>
2679void writeBTDValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2680#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2681 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2682 std::ofstream myfile;
2683 myfile.open(fileName);
2684
2685 const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2686 local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2687 const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2688 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2689
2690 const local_ordinal_type block_size = scalar_values.extent(1);
2691
2692 const local_ordinal_type n_rows_per_part = (n_blocks_per_part + 2) / 3 * block_size;
2693 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2694
2695 const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2696
2697 myfile << "%%MatrixMarket matrix coordinate real general" << std::endl;
2698 myfile << "%%nnz = " << nnz;
2699 myfile << " block size = " << block_size;
2700 myfile << " number of blocks = " << n_blocks;
2701 myfile << " number of parts = " << n_parts;
2702 myfile << " number of blocks per part = " << n_blocks_per_part;
2703 myfile << " number of rows = " << n_rows;
2704 myfile << " number of cols = " << n_rows;
2705 myfile << " number of packs = " << n_packs << std::endl;
2706
2707 myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;
2708
2709 local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2710 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2711 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2712 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2713 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2714 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2715 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2716 continue;
2717 if (i_block_in_part % 3 == 0) {
2718 current_row_offset = i_block_in_part / 3 * block_size;
2719 current_col_offset = i_block_in_part / 3 * block_size;
2720 } else if (i_block_in_part % 3 == 1) {
2721 current_row_offset = (i_block_in_part - 1) / 3 * block_size;
2722 current_col_offset = ((i_block_in_part - 1) / 3 + 1) * block_size;
2723 } else if (i_block_in_part % 3 == 2) {
2724 current_row_offset = ((i_block_in_part - 2) / 3 + 1) * block_size;
2725 current_col_offset = (i_block_in_part - 2) / 3 * block_size;
2726 }
2727 current_row_offset += current_part_idx * n_rows_per_part;
2728 current_col_offset += current_part_idx * n_rows_per_part;
2729 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2730 for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2731 current_row = current_row_offset + i_in_block + 1;
2732 current_col = current_col_offset + j_in_block + 1;
2733 myfile << current_row << " " << current_col << " " << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2734 }
2735 }
2736 }
2737 }
2738 }
2739
2740 myfile.close();
2741#endif
2742}
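// Usage sketch (the file name here is arbitrary): output is produced only
// when IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM is defined; otherwise the body
// compiles to a no-op.
//   writeBTDValuesToFile(n_parts, scalar_values, "btdm_values.mm");
// The result is a MatrixMarket coordinate file readable by any MatrixMarket
// loader (e.g. mmread in MATLAB/Octave or scipy.io.mmread).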
2743
2744template <typename local_ordinal_type, typename ViewType>
2745void write4DMultiVectorValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2746#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2747 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2748 std::ofstream myfile;
2749 myfile.open(fileName);
2750
2751 const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2752 const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2753 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2754
2755 const local_ordinal_type block_size = scalar_values.extent(1);
2756 const local_ordinal_type n_cols = scalar_values.extent(2);
2757
2758 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2759 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2760
2761 const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2762
2763 myfile << "%%MatrixMarket matrix array real general" << std::endl;
2764 myfile << "%%block size = " << block_size;
2765 myfile << " number of blocks = " << n_blocks;
2766 myfile << " number of parts = " << n_parts;
2767 myfile << " number of blocks per part = " << n_blocks_per_part;
2768 myfile << " number of rows = " << n_rows;
2769 myfile << " number of cols = " << n_cols;
2770 myfile << " number of packs = " << n_packs << std::endl;
2771
2772 myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2773
2774 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2775 (void)current_row_offset;
2776 (void)current_part_idx;
2777 for (local_ordinal_type j_in_block = 0; j_in_block < n_cols; ++j_in_block) {
2778 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2779 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2780 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2781 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2782 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2783
2784 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2785 continue;
2786 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2787 myfile << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2788 }
2789 }
2790 }
2791 }
2792 }
2793 myfile.close();
2794#endif
2795}
2796
2797template <typename local_ordinal_type, typename ViewType>
2798void write5DMultiVectorValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2799#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2800 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2801 std::ofstream myfile;
2802 myfile.open(fileName);
2803
2804 const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2805 const local_ordinal_type n_blocks = scalar_values.extent(1) * n_parts_per_pack;
2806 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2807
2808 const local_ordinal_type block_size = scalar_values.extent(2);
2809 const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2810 const local_ordinal_type n_cols = n_blocks_cols * block_size;
2811
2812 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2813 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2814
2815 const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2816
2817 myfile << "%%MatrixMarket matrix array real general" << std::endl;
2818 myfile << "%%block size = " << block_size;
2819 myfile << " number of blocks = " << n_blocks;
2820 myfile << " number of parts = " << n_parts;
2821 myfile << " number of blocks per part = " << n_blocks_per_part;
2822 myfile << " number of rows = " << n_rows;
2823 myfile << " number of cols = " << n_cols;
2824 myfile << " number of packs = " << n_packs << std::endl;
2825
2826 myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2827
2828 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2829 (void)current_row_offset;
2830 (void)current_part_idx;
2831 for (local_ordinal_type i_block_col = 0; i_block_col < n_blocks_cols; ++i_block_col) {
2832 for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2833 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2834 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2835 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2836 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2837 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2838
2839 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(1))
2840 continue;
2841 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2842 myfile << scalar_values(i_block_col, current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2843 }
2844 }
2845 }
2846 }
2847 }
2848 }
2849 myfile.close();
2850#endif
2851}
2852
2853template <typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
2854KOKKOS_INLINE_FUNCTION void
2855copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
2856 /*
2857 // Kokkos::Experimental::local_deep_copy
2858 auto teamVectorRange =
2859 Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, member_type>(
2860 member, view1.extent(0), view1.extent(1), view1.extent(2));
2861
2862 Kokkos::parallel_for
2863 (teamVectorRange,
2864 [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) {
2865 view1(i,j,k) = view2(i,j,k);
2866 });
2867 */
2868 Kokkos::Experimental::local_deep_copy(member, view1, view2);
2869}
2870template <typename MatrixType, int ScratchLevel>
2871struct ExtractAndFactorizeTridiags {
2872 public:
2873 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2874 // a functor cannot have both device_type and execution_space; defining both causes a specialization error in Kokkos
2875 using execution_space = typename impl_type::execution_space;
2876 using memory_space = typename impl_type::memory_space;
2878 using local_ordinal_type = typename impl_type::local_ordinal_type;
2879 using size_type = typename impl_type::size_type;
2880 using impl_scalar_type = typename impl_type::impl_scalar_type;
2881 using magnitude_type = typename impl_type::magnitude_type;
2883 using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
2884 using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
2886 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2887 using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
2888 using size_type_1d_view = typename impl_type::size_type_1d_view;
2889 using size_type_2d_view = typename impl_type::size_type_2d_view;
2890 using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
2892 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2893 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2894 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2895 using vector_type_4d_view = typename impl_type::vector_type_4d_view;
2896 using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2897 using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
2898 using btdm_scalar_type_2d_view = typename impl_type::btdm_scalar_type_2d_view;
2899 using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
2900 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2901 using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
2902 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2903 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2904 using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
2905 using local_crs_graph_type = typename impl_type::local_crs_graph_type;
2906 using colinds_view = typename local_crs_graph_type::entries_type;
2907
2908 using internal_vector_type = typename impl_type::internal_vector_type;
2909 static constexpr int vector_length = impl_type::vector_length;
2910 static constexpr int internal_vector_length = impl_type::internal_vector_length;
2911 static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
2912 static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2913 // half_vector_length is used for block Jacobi factorization.
2914 // Shared memory requirement is twice as large (per vector lane) as for general tridi factorization, so
2915 // reducing vector length (if possible) keeps the shared requirement constant. This avoids the performance
2916 // cliff of switching from level 0 to level 1 scratch.
2917 static constexpr int half_vector_length = impl_type::half_vector_length;
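// Worked example of the scratch sizing above (illustrative numbers only):
// with blocksize = 5, an 8-byte btdm_scalar_type, and vector_length = 8, the
// general tridi path needs 5*5*8*8 = 1600 bytes of scratch per team. Fused
// block Jacobi needs two such buffers, which at full vector length would be
// 3200 bytes; with half_vector_length = 4 it needs 2*5*5*4*8 = 1600 bytes,
// the same budget as the general path.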
2918
2920 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2921 using member_type = typename team_policy_type::member_type;
2922
2923 private:
2924 // part interface
2925 const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2926 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2927 const local_ordinal_type max_partsz;
2928 // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
2929 using size_type_1d_view_tpetra = Kokkos::View<size_t *, typename impl_type::node_device_type>;
2930 ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2931 ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2932 ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2933 // block tridiags
2934 const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2935 const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2936 const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2937 const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2938 const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2939 const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2940 const Unmanaged<btdm_scalar_type_3d_view> d_inv;
2941 const Unmanaged<size_type_1d_view> diag_offsets;
2942 // shared information
2943 const local_ordinal_type blocksize, blocksize_square;
2944 // diagonal safety
2945 const magnitude_type tiny;
2946 const local_ordinal_type vector_loop_size;
2947
2948 bool hasBlockCrsMatrix;
2949
2950 public:
2951 ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2952 const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2953 const Teuchos::RCP<const row_matrix_type> &A_,
2954 const Teuchos::RCP<const crs_graph_type> &G_,
2955 const magnitude_type &tiny_)
2956 : // interface
2957 partptr(interf_.partptr)
2958 , lclrow(interf_.lclrow)
2959 , packptr(interf_.packptr)
2960 , packindices_sub(interf_.packindices_sub)
2961 , packptr_sub(interf_.packptr_sub)
2962 , partptr_sub(interf_.partptr_sub)
2963 , part2packrowidx0_sub(interf_.part2packrowidx0_sub)
2964 , packindices_schur(interf_.packindices_schur)
2965 , max_partsz(interf_.max_partsz)
2966 ,
2967 // block tridiags
2968 pack_td_ptr(btdm_.pack_td_ptr)
2969 , flat_td_ptr(btdm_.flat_td_ptr)
2970 , pack_td_ptr_schur(btdm_.pack_td_ptr_schur)
2971 , A_colindsub(btdm_.A_colindsub)
2972 , internal_vector_values((internal_vector_type *)btdm_.values.data(),
2973 btdm_.values.extent(0),
2974 btdm_.values.extent(1),
2975 btdm_.values.extent(2),
2976 vector_length / internal_vector_length)
2977 , internal_vector_values_schur((internal_vector_type *)btdm_.values_schur.data(),
2978 btdm_.values_schur.extent(0),
2979 btdm_.values_schur.extent(1),
2980 btdm_.values_schur.extent(2),
2981 vector_length / internal_vector_length)
2982 , e_internal_vector_values((internal_vector_type *)btdm_.e_values.data(),
2983 btdm_.e_values.extent(0),
2984 btdm_.e_values.extent(1),
2985 btdm_.e_values.extent(2),
2986 btdm_.e_values.extent(3),
2987 vector_length / internal_vector_length)
2988 , scalar_values((btdm_scalar_type *)btdm_.values.data(),
2989 btdm_.values.extent(0),
2990 btdm_.values.extent(1),
2991 btdm_.values.extent(2),
2992 vector_length)
2993 , scalar_values_schur((btdm_scalar_type *)btdm_.values_schur.data(),
2994 btdm_.values_schur.extent(0),
2995 btdm_.values_schur.extent(1),
2996 btdm_.values_schur.extent(2),
2997 vector_length)
2998 , e_scalar_values((btdm_scalar_type *)btdm_.e_values.data(),
2999 btdm_.e_values.extent(0),
3000 btdm_.e_values.extent(1),
3001 btdm_.e_values.extent(2),
3002 btdm_.e_values.extent(3),
3003 vector_length)
3004 , d_inv(btdm_.d_inv)
3005 , diag_offsets(btdm_.diag_offsets)
3006 , blocksize(btdm_.values.extent(1))
3007 , blocksize_square(blocksize * blocksize)
3008 ,
3009 // diagonal weight to avoid zero pivots
3010 tiny(tiny_)
3011 , vector_loop_size(vector_length / internal_vector_length) {
3012 using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
3013 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
3014
3015 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
3016 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);
3017
3018 hasBlockCrsMatrix = !A_bcrs.is_null();
3019
3020 A_block_rowptr = G_->getLocalGraphDevice().row_map;
3021 if (hasBlockCrsMatrix) {
3022 A_values = const_cast<block_crs_matrix_type *>(A_bcrs.get())->getValuesDeviceNonConst();
3023 } else {
3024 A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
3025 A_values = A_crs->getLocalValuesDevice(Tpetra::Access::ReadOnly);
3026 }
3027 }
3028
3029 private:
3030 KOKKOS_INLINE_FUNCTION
3031 void
3032 extract(local_ordinal_type partidx,
3033 local_ordinal_type local_subpartidx,
3034 local_ordinal_type npacks) const {
3035#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3036 printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
3037#endif
3038 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3039 const size_type kps = pack_td_ptr(partidx, local_subpartidx);
3040 local_ordinal_type kfs[vector_length] = {};
3041 local_ordinal_type ri0[vector_length] = {};
3042 local_ordinal_type nrows[vector_length] = {};
3043
3044 for (local_ordinal_type vi = 0; vi < npacks; ++vi, ++partidx) {
3045 kfs[vi] = flat_td_ptr(partidx, local_subpartidx);
3046 ri0[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 0);
3047 nrows[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 1) - ri0[vi];
3048#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3049 printf("kfs[%d] = %d;\n", vi, kfs[vi]);
3050 printf("ri0[%d] = %d;\n", vi, ri0[vi]);
3051 printf("nrows[%d] = %d;\n", vi, nrows[vi]);
3052#endif
3053 }
3054 local_ordinal_type tr_min = 0;
3055 local_ordinal_type tr_max = nrows[0];
3056 if (local_subpartidx % 2 == 1) {
3057 tr_min -= 1;
3058 tr_max += 1;
3059 }
3060#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3061 printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3062#endif
3063 for (local_ordinal_type tr = tr_min, j = 0; tr < tr_max; ++tr) {
3064 for (local_ordinal_type e = 0; e < 3; ++e) {
3065 if (hasBlockCrsMatrix) {
3066 const impl_scalar_type *block[vector_length] = {};
3067 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3068 const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
3069
3070 block[vi] = &A_values(Aj * blocksize_square);
3071 }
3072 const size_type pi = kps + j;
3073#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3074 printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
3075#endif
3076 ++j;
3077 for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3078 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3079 const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
3080 auto &v = internal_vector_values(pi, ii, jj, 0);
3081 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3082 v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
3083 }
3084 }
3085 }
3086 } else {
3087 const size_type pi = kps + j;
3088
3089 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3090 const size_type Aj_c = A_colindsub(kfs[vi] + j);
3091
3092 for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3093 auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr) * blocksize + ii);
3094
3095 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3096 scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c * blocksize + jj);
3097 }
3098 }
3099 }
3100 ++j;
3101 }
3102 if (nrows[0] == 1) break;
3103 if (local_subpartidx % 2 == 0) {
3104 if (e == 1 && (tr == 0 || tr + 1 == nrows[0])) break;
3105 for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3106 if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr + 1 == nrows[vi])) {
3107 npacks = vi;
3108 break;
3109 }
3110 }
3111 } else {
3112 if (e == 0 && (tr == -1 || tr == nrows[0])) break;
3113 for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3114 if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
3115 npacks = vi;
3116 break;
3117 }
3118 }
3119 }
3120 }
3121 }
3122 }
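// Summary of the serial extract above: lane vi of the pack owns part
// partidx + vi, with kfs[vi], ri0[vi], and nrows[vi] giving that lane's
// offset into the flat tridiag, first row, and row count. The e = 0,1,2
// inner loop visits the (diagonal, super, sub) blocks of row tr, shrinking
// npacks as shorter parts in the pack run out of rows.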
3123
3124 KOKKOS_INLINE_FUNCTION
3125 void
3126 extract(const member_type &member,
3127 const local_ordinal_type &partidxbeg,
3128 local_ordinal_type local_subpartidx,
3129 const local_ordinal_type &npacks,
3130 const local_ordinal_type &vbeg) const {
3131#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3132 printf("extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3133#endif
3134 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3135 local_ordinal_type kfs_vals[internal_vector_length] = {};
3136 local_ordinal_type ri0_vals[internal_vector_length] = {};
3137 local_ordinal_type nrows_vals[internal_vector_length] = {};
3138
3139 const size_type kps = pack_td_ptr(partidxbeg, local_subpartidx);
3140 for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3141 kfs_vals[vi] = flat_td_ptr(partidxbeg + vi, local_subpartidx);
3142 ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 0);
3143 nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 1) - ri0_vals[vi];
3144#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3145 printf("kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3146 printf("ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3147 printf("nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3148#endif
3149 }
3150
3151 local_ordinal_type j_vals[internal_vector_length] = {};
3152
3153 local_ordinal_type tr_min = 0;
3154 local_ordinal_type tr_max = nrows_vals[0];
3155 if (local_subpartidx % 2 == 1) {
3156 tr_min -= 1;
3157 tr_max += 1;
3158 }
3159#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3160 printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3161#endif
3162 for (local_ordinal_type tr = tr_min; tr < tr_max; ++tr) {
3163 for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3164 const local_ordinal_type nrows = nrows_vals[vi]; // same value for even and odd subparts
3165 if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows + 1)) {
3166 auto &j = j_vals[vi];
3167 const local_ordinal_type kfs = kfs_vals[vi];
3168 const local_ordinal_type ri0 = ri0_vals[vi];
3169 local_ordinal_type lbeg, lend;
3170 if (local_subpartidx % 2 == 0) {
3171 lbeg = (tr == tr_min ? 1 : 0);
3172 lend = (tr == nrows - 1 ? 2 : 3);
3173 } else {
3174 lbeg = 0;
3175 lend = 3;
3176 if (tr == tr_min) {
3177 lbeg = 1;
3178 lend = 2;
3179 } else if (tr == nrows) {
3180 lbeg = 0;
3181 lend = 1;
3182 }
3183 }
3184 if (hasBlockCrsMatrix) {
3185 for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3186 const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3187 const impl_scalar_type *block = &A_values(Aj * blocksize_square);
3188 const size_type pi = kps + j;
3189#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3190 printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3191#endif
3192 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3193 [&](const local_ordinal_type &ii) {
3194 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3195 scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii, jj, blocksize)]);
3196 }
3197 });
3198 }
3199 } else {
3200 for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3201 const size_type Aj_c = A_colindsub(kfs + j);
3202 const size_type pi = kps + j;
3203 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3204 [&](const local_ordinal_type &ii) {
3205 auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr) * blocksize + ii);
3206 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3207 scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c * blocksize + jj);
3208 }
3209 });
3210 }
3211 }
3212 }
3213 }
3214 }
3215 }
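// Note on the lbeg/lend bounds above: within an even (interior-line)
// subpart, the first row has no sub-diagonal block (lbeg = 1) and the last
// row has no super-diagonal block (lend = 2). Odd subparts are the Schur
// connector rows, so tr is extended by one on each side (tr_min = -1,
// tr_max = nrows + 1) and each boundary row contributes a single block.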
3216
3217 template <typename AAViewType,
3218 typename WWViewType>
3219 KOKKOS_INLINE_FUNCTION void
3220 factorize_subline(const member_type &member,
3221 const local_ordinal_type &i0,
3222 const local_ordinal_type &nrows,
3223 const local_ordinal_type &v,
3224 const AAViewType &AA,
3225 const WWViewType &WW) const {
3226 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3227
3228 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3229 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3230
3231 // constant
3232#if KOKKOS_VERSION >= 40799
3233 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3234#else
3235 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3236#endif
3237
3238#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3239 printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3240#endif
3241
3242 // subview pattern
3243 auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3244 KB::LU<member_type,
3245 default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3246
3247 if (nrows > 1) {
3248 auto B = A;
3249 auto C = A;
3250 local_ordinal_type i = i0;
3251 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
3252#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3253 printf("tr = %d, i = %d;\n", tr, i);
3254#endif
3255 B.assign_data(&AA(i + 1, 0, 0, v));
3256 KB::Trsm<member_type,
3257 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3258 default_mode_type, default_algo_type>::invoke(member, one, A, B);
3259 C.assign_data(&AA(i + 2, 0, 0, v));
3260 KB::Trsm<member_type,
3261 KB::Side::Right, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3262 default_mode_type, default_algo_type>::invoke(member, one, A, C);
3263 A.assign_data(&AA(i + 3, 0, 0, v));
3264
3265 member.team_barrier();
3266 KB::Gemm<member_type,
3267 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3268 default_mode_type, default_algo_type>::invoke(member, -one, C, B, one, A);
3269 KB::LU<member_type,
3270 default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3271 }
3272 } else {
3273 // for block Jacobi, invert the diagonal block here
3274 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3275 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, A, W);
3276 KB::SetIdentity<member_type, default_mode_type>::invoke(member, A);
3277 member.team_barrier();
3278 KB::Trsm<member_type,
3279 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3280 default_mode_type, default_algo_type>::invoke(member, one, W, A);
3281 KB::Trsm<member_type,
3282 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3283 default_mode_type, default_algo_type>::invoke(member, one, W, A);
3284 }
3285 }
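// Sketch of the recurrence implemented by factorize_subline (block
// tridiagonal LU over the packed [D_r, U_r, L_r] layout):
//   D_1 <- LU(D_1)
//   for r = 2..nrows:
//     U_{r-1} <- L(D_{r-1})^{-1} U_{r-1}    // Trsm, Left/Lower/Unit
//     L_{r-1} <- L_{r-1} U(D_{r-1})^{-1}    // Trsm, Right/Upper/NonUnit
//     D_r     <- LU(D_r - L_{r-1} U_{r-1})  // Gemm, then LU
// `tiny` is passed to LU as a diagonal safeguard against zero pivots; the
// nrows == 1 branch instead forms the explicit inverse required by block
// Jacobi via SetIdentity plus two Trsm calls.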
3286
3287 public:
3288 struct ExtractAndFactorizeSubLineTag {};
3289 struct ExtractAndFactorizeFusedJacobiTag {};
3290 struct ExtractBCDTag {};
3291 struct ComputeETag {};
3292 struct ComputeSchurTag {};
3293 struct FactorizeSchurTag {};
3294
3295 KOKKOS_INLINE_FUNCTION
3296 void
3297 operator()(const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
3298 // btdm is packed, with parts sorted from largest to smallest
3299 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3300
3301 const local_ordinal_type subpartidx = packptr_sub(packidx);
3302 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3303 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3304 const local_ordinal_type partidx = subpartidx % n_parts;
3305
3306 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3307 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3308 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3309
3310 internal_vector_scratch_type_3d_view
3311 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3312
3313#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3314 printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3315 printf("vector_loop_size = %d\n", vector_loop_size);
3316#endif
3317
3318 if (vector_loop_size == 1) {
3319 extract(partidx, local_subpartidx, npacks);
3320 factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3321 } else {
3322 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3323 [&](const local_ordinal_type &v) {
3324 const local_ordinal_type vbeg = v * internal_vector_length;
3325#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3326 printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3327#endif
3328 if (vbeg < npacks)
3329 extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3330 // this is not safe if the vector loop size differs from the vector length of
3331 // the team policy; we always ensure they match when constructing the team policy
3332 member.team_barrier();
3333 factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3334 });
3335 }
3336 }
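// Note: packptr_sub flattens (part, subpart) pairs as
// subpartidx = local_subpartidx * n_parts + partidx, which is why this and
// the following operators decode indices with / n_parts and % n_parts.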
3337
3338 KOKKOS_INLINE_FUNCTION
3339 void
3340 operator()(const ExtractAndFactorizeFusedJacobiTag &, const member_type &member) const {
3341 using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>;
3342 using default_mode_type = typename default_mode_and_algo_type::mode_type;
3343 using default_algo_type = typename default_mode_and_algo_type::algo_type;
3344 // When fused block Jacobi can be used, the mapping between local rows and parts is trivial (i <-> i)
3345 // We can simply pull the diagonal entry from A into d_inv
3346 btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3347 btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3348#if KOKKOS_VERSION >= 40799
3349 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3350#else
3351 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3352#endif
3353 const local_ordinal_type nrows = lclrow.extent(0);
3354 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, half_vector_length),
3355 [&](const local_ordinal_type &v) {
3356 local_ordinal_type row = member.league_rank() * half_vector_length + v;
3357 // diag_offsets(row) gives the offset of this row's diagonal block within A_values
3358 auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL());
3359 auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL());
3360 if (row < nrows) {
3361 // View the diagonal block of A in row as 2D row-major
3362 const impl_scalar_type *A_diag = A_values.data() + diag_offsets(row);
3363 // Copy the diag into scratch slice W1
3364 // (copying elements directly is better than KokkosBatched copy)
3365 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3366 [&](int i) {
3367 W1.data()[i] = A_diag[i];
3368 });
3369 // and set W2 to identity in preparation to invert with 2 x Trsm
3370 KB::SetIdentity<member_type, default_mode_type>::invoke(member, W2);
3371 } else {
3372 // if this vector lane has no block to invert, then set W1 to identity
3373 // so that LU still has a matrix to work on. LU uses team barriers so
3374 // having some lanes run it and some not will deadlock.
3375 KB::SetIdentity<member_type, default_mode_type>::invoke(member, W1);
3376 }
3377 member.team_barrier();
3378 // LU factorize in-place
3379 KB::LU<member_type, default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, W1, tiny);
3380 member.team_barrier();
3381 KB::Trsm<member_type,
3382 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3383 default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3384 KB::Trsm<member_type,
3385 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3386 default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3387 member.team_barrier();
3388 if (row < nrows) {
3389 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3390 [&](int i) {
3391 auto d_inv_block = &d_inv(row, 0, 0);
3392 d_inv_block[i] = W2.data()[i];
3393 });
3394 }
3395 });
3396 }
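// Summary: in the fused Jacobi path each vector lane inverts one diagonal
// block entirely in scratch: W1 = A_diag, W2 = I, then LU(W1) followed by
// the two Trsm calls yields W2 = U^{-1} L^{-1} = A_diag^{-1}, which is
// written back to d_inv(row, ., .). Lanes beyond the last row factor an
// identity block so the team-collective LU/Trsm calls cannot deadlock.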
3397
3398 KOKKOS_INLINE_FUNCTION
3399 void
3400 operator()(const ExtractBCDTag &, const member_type &member) const {
3401 // btdm is packed, with parts sorted from largest to smallest
3402 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3403 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3404 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3405
3406 const local_ordinal_type subpartidx = packptr_sub(packidx);
3407 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3408 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3409 const local_ordinal_type partidx = subpartidx % n_parts;
3410
3411 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3412 // const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3413 // const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3414
3415 if (vector_loop_size == 1) {
3416 extract(partidx, local_subpartidx, npacks);
3417 } else {
3418 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3419 [&](const local_ordinal_type &v) {
3420 const local_ordinal_type vbeg = v * internal_vector_length;
3421#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3422 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3423 printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3424#endif
3425 if (vbeg < npacks)
3426 extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3427 });
3428 }
3429
3430 member.team_barrier();
3431
3432 const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3433 const size_type kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 1;
3434
3435 const local_ordinal_type r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3436 const local_ordinal_type r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3437
3438#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3439 printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3440#endif
3441
3442 // Need to copy D to e_internal_vector_values.
3443 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3444 Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3445
3446 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3447 Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3448 }
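// Note: the two staged blocks (kps1, the first entry of this connector
// subpart's packed tridiag, and kps2, its last) are the couplings to the
// neighboring interior lines; plausibly the B blocks of the tag's B/C/D
// naming, since ComputeETag solves against them to form E and
// ComputeSchurTag then applies S = D - C E.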
3449
3450 KOKKOS_INLINE_FUNCTION
3451 void
3452 operator()(const ComputeETag &, const member_type &member) const {
3453 // btdm is packed, with parts sorted from largest to smallest
3454 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3455
3456 const local_ordinal_type subpartidx = packptr_sub(packidx);
3457 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3458 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3459 const local_ordinal_type partidx = subpartidx % n_parts;
3460
3461 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3462 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3463 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
3464 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3465 const local_ordinal_type num_vectors = blocksize;
3466
3467 (void)npacks;
3468
3469 internal_vector_scratch_type_3d_view
3470 WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3471 if (local_subpartidx == 0) {
3472 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3473 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3474 });
3475 } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
3476 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3477 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3478 });
3479 } else {
3480 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3481 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3482 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3483 });
3484 }
3485 }
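// Summary: this forms E by reusing the factored interior line as a
// multi-vector solve with e_internal_vector_values as the right-hand sides
// (num_vectors = blocksize, i.e. one solve per block column). The first and
// last sublines couple to a single connector, hence the lone
// solveMultiVector call in those branches; interior sublines solve both.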
3486
3487 KOKKOS_INLINE_FUNCTION
3488 void
3489 operator()(const ComputeSchurTag &, const member_type &member) const {
3490 // btdm is packed, with parts sorted from largest to smallest
3491 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3492 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3493 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3494
3495 const local_ordinal_type subpartidx = packptr_sub(packidx);
3496 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3497 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3498 const local_ordinal_type partidx = subpartidx % n_parts;
3499
3500 // const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3501 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3502 // const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3503 // const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3504
3505 // Compute S = D - C E
3506
3507 const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
3508 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
3509 const local_ordinal_type i0_offset = i0 + 2; // identical for both branches of local_subpartidx_schur
3510
3511 for (local_ordinal_type i = 0; i < 4; ++i) { // pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-i0_schur
3512 copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3513 Kokkos::subview(internal_vector_values, i0_offset + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3514 }
3515
3516 member.team_barrier();
3517
3518#if KOKKOS_VERSION >= 40799
3519 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3520#else
3521 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3522#endif
3523
3524 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx) + 1;
3525 const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 2;
3526
3527 const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3528 const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3529
3530 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3531
3532 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3533 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3534
3535 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3536 for (size_type i = 0; i < pack_td_ptr_schur(partidx, local_subpartidx_schur + 1) - pack_td_ptr_schur(partidx, local_subpartidx_schur); ++i) {
3537 local_ordinal_type e_r, e_c, c_kps;
3538
3539 if (local_subpartidx_schur == 0) {
3540 if (i == 0) {
3541 e_r = e_r1;
3542 e_c = 0;
3543 c_kps = c_kps1;
3544 } else if (i == 3) {
3545 e_r = e_r2;
3546 e_c = 1;
3547 c_kps = c_kps2;
3548 } else if (i == 4) {
3549 e_r = e_r2;
3550 e_c = 0;
3551 c_kps = c_kps2;
3552 } else {
3553 continue;
3554 }
3555 } else {
3556 if (i == 0) {
3557 e_r = e_r1;
3558 e_c = 1;
3559 c_kps = c_kps1;
3560 } else if (i == 1) {
3561 e_r = e_r1;
3562 e_c = 0;
3563 c_kps = c_kps1;
3564 } else if (i == 4) {
3565 e_r = e_r2;
3566 e_c = 1;
3567 c_kps = c_kps2;
3568 } else if (i == 5) {
3569 e_r = e_r2;
3570 e_c = 0;
3571 c_kps = c_kps2;
3572 } else {
3573 continue;
3574 }
3575 }
3576
3577 auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx, local_subpartidx_schur) + i, Kokkos::ALL(), Kokkos::ALL(), v);
3578 auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3579 auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3580 KB::Gemm<member_type,
3581 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3582 default_mode_type, default_algo_type>::invoke(member, -one, C, E, one, S);
3583 }
3584 });
3585 }
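// Summary: the Schur entries are first seeded with copies of the
// corresponding tridiag blocks by the copy3DView loop above; the
// i -> (e_r, e_c, c_kps) tables then pick which C block and which block
// column of E contribute the -C * E update, and i values skipped via
// `continue` receive no such correction.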
3586
3587 KOKKOS_INLINE_FUNCTION
3588 void
3589 operator()(const FactorizeSchurTag &, const member_type &member) const {
3590 const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3591
3592 const local_ordinal_type subpartidx = packptr_sub(packidx);
3593
3594 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3595 const local_ordinal_type partidx = subpartidx % n_parts;
3596
3597 const local_ordinal_type i0 = pack_td_ptr_schur(partidx, 0);
3598 const local_ordinal_type nrows = 2 * (pack_td_ptr_schur.extent(1) - 1);
3599
3600 internal_vector_scratch_type_3d_view
3601 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3602
3603#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3604 printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3605#endif
3606
3607 if (vector_loop_size == 1) {
3608 factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3609 } else {
3610 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3611 [&](const local_ordinal_type &v) {
3612 factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3613 });
3614 }
3615 }
3616
3617 void run() {
3618 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3619 const local_ordinal_type team_size =
3620 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3621 recommended_team_size(blocksize, vector_length, internal_vector_length);
3622 const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3623 shmem_size(blocksize, blocksize, vector_loop_size);
3624
3625 {
3626#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3627 printf("Start ExtractAndFactorizeSubLineTag\n");
3628#endif
3629 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3630 Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeSubLineTag>
3631 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3632
3633 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3634 writeBTDValuesToFile(n_parts, scalar_values, "before.mm");
3635
3636 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3637 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3638 policy, *this);
3639 execution_space().fence();
3640
3641 writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
3642#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3643 printf("End ExtractAndFactorizeSubLineTag\n");
3644#endif
3645 }
3646
3647 if (packindices_schur.extent(1) > 0) {
3648 {
3649#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3650 printf("Start ExtractBCDTag\n");
3651#endif
3652#if KOKKOS_VERSION >= 40799
3653 Kokkos::deep_copy(e_scalar_values, KokkosKernels::ArithTraits<btdm_magnitude_type>::zero());
3654#else
3655 Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3656#endif
3657#if KOKKOS_VERSION >= 40799
3658 Kokkos::deep_copy(scalar_values_schur, KokkosKernels::ArithTraits<btdm_magnitude_type>::zero());
3659#else
3660 Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3661#endif
3662
3663 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");
3664
3665 {
3666 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3667 Kokkos::TeamPolicy<execution_space, ExtractBCDTag>
3668 policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3669
3670 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3671 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3672 policy, *this);
3673 execution_space().fence();
3674 }
3675
3676#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3677 printf("End ExtractBCDTag\n");
3678#endif
3679 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
3680#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3681 printf("Start ComputeETag\n");
3682#endif
3683 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
3684 {
3685 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3686 Kokkos::TeamPolicy<execution_space, ComputeETag>
3687 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3688
3689 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3690 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3691 policy, *this);
3692 execution_space().fence();
3693 }
3694 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");
3695
3696#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3697 printf("End ComputeETag\n");
3698#endif
3699 }
3700
3701 {
3702#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3703 printf("Start ComputeSchurTag\n");
3704#endif
3705 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3706 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
3707 Kokkos::TeamPolicy<execution_space, ComputeSchurTag>
3708 policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3709
3710 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3711 policy, *this);
3712 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
3713 execution_space().fence();
3714#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3715 printf("End ComputeSchurTag\n");
3716#endif
3717 }
3718
3719 {
3720#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3721 printf("Start FactorizeSchurTag\n");
3722#endif
3723 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3724 Kokkos::TeamPolicy<execution_space, FactorizeSchurTag>
3725 policy(packindices_schur.extent(0), team_size, vector_loop_size);
3726 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3727 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3728 policy, *this);
3729 execution_space().fence();
3730 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
3731#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3732 printf("End FactorizeSchurTag\n");
3733#endif
3734 }
3735 }
3736
3737 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3738 }
3739
3740 void run_fused_jacobi() {
3741 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3742 const local_ordinal_type team_size =
3743 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3744 recommended_team_size(blocksize, half_vector_length, 1);
3745 const local_ordinal_type per_team_scratch =
3746 btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * half_vector_length);
3747 {
3748 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag);
3749 Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeFusedJacobiTag>
3750 policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length);
3751
3752 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3753 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeFusedJacobiTag>",
3754 policy, *this);
3755 }
3756 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3757 }
3758};
3759
3763template <typename MatrixType>
3764void performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
3765 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3766 const BlockHelperDetails::PartInterface<MatrixType> &interf,
3767 BlockTridiags<MatrixType> &btdm,
3768 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny,
3769 bool use_fused_jacobi) {
3770 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3771 using execution_space = typename impl_type::execution_space;
3772 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3773 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3774 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
3775
3776 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
3777
3778 int blocksize = btdm.values.extent(1);
3779 // Both Kokkos policy vector length and SIMD type vector length are hardcoded in KokkosBatched.
3780 // For large block sizes, we have to fall back to level 1 scratch.
3781 int scratch_required;
3782 if (!use_fused_jacobi) {
3783 // General path scratch requirement
3784 scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3785 } else {
3786 // Block Jacobi scratch requirement: measured in scalars, and uses twice as much (in bytes) per vector lane as the general path.
3787 scratch_required = btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * impl_type::half_vector_length);
3788 }
3789
3790 int max_scratch = team_policy_type::scratch_size_max(0);
3791
3792 if (scratch_required < max_scratch) {
3793 // Can use level 0 scratch
3794 ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
3795 if (!use_fused_jacobi)
3796 function.run();
3797 else
3798 function.run_fused_jacobi();
3799 } else {
3800 // Not enough level 0 scratch, so fall back to level 1
3801 ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
3802 if (!use_fused_jacobi)
3803 function.run();
3804 else
3805 function.run_fused_jacobi();
3806 }
3807 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3808}
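// Usage sketch (names as defined in this file): the numeric phase is driven
// by a single call such as
//   performNumericPhase<MatrixType>(A, G, interf, btdm, tiny, use_fused_jacobi);
// Both ExtractAndFactorizeTridiags<MatrixType, 0> and <MatrixType, 1> are
// instantiated; the level-0 (shared-memory) variant is selected at run time
// only when the per-team scratch fits within scratch_size_max(0).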
3809
3813template <typename MatrixType>
3814struct MultiVectorConverter {
3815 public:
3816 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3817 using execution_space = typename impl_type::execution_space;
3818 using memory_space = typename impl_type::memory_space;
3819
3820 using local_ordinal_type = typename impl_type::local_ordinal_type;
3821 using impl_scalar_type = typename impl_type::impl_scalar_type;
3822 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3823 using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3824 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3825 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3826 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3827 using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
3828 static constexpr int vector_length = impl_type::vector_length;
3829
3830 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3831
3832 private:
3833 // part interface
3834 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3835 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3836 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3837 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3838 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3839 const local_ordinal_type blocksize;
3840 const local_ordinal_type num_vectors;
3841
3842 // packed multivector output (or input)
3843 vector_type_3d_view packed_multivector;
3844 const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3845
3846 template <typename TagType>
3847 KOKKOS_INLINE_FUNCTION void copy_multivectors(const local_ordinal_type &j,
3848 const local_ordinal_type &vi,
3849 const local_ordinal_type &pri,
3850 const local_ordinal_type &ri0) const {
3851 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3852 for (local_ordinal_type i = 0; i < blocksize; ++i)
3853 packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3854 }
3855
3856 public:
3857 MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3858 const vector_type_3d_view &pmv)
3859 : partptr(interf.partptr)
3860 , packptr(interf.packptr)
3861 , part2packrowidx0(interf.part2packrowidx0)
3862 , part2rowidx0(interf.part2rowidx0)
3863 , lclrow(interf.lclrow)
3864 , blocksize(pmv.extent(1))
3865 , num_vectors(pmv.extent(2))
3866 , packed_multivector(pmv) {}
3867
3868 // TODO: modify this routine similarly to the team-level functions.
3869 KOKKOS_INLINE_FUNCTION
3870 void
3871 operator()(const local_ordinal_type &packidx) const {
3872 local_ordinal_type partidx = packptr(packidx);
3873 local_ordinal_type npacks = packptr(packidx + 1) - partidx;
3874 const local_ordinal_type pri0 = part2packrowidx0(partidx);
3875
3876 local_ordinal_type ri0[vector_length] = {};
3877 local_ordinal_type nrows[vector_length] = {};
3878 for (local_ordinal_type v = 0; v < npacks; ++v, ++partidx) {
3879 ri0[v] = part2rowidx0(partidx);
3880 nrows[v] = part2rowidx0(partidx + 1) - ri0[v];
3881 }
3882 for (local_ordinal_type j = 0; j < nrows[0]; ++j) {
3883 local_ordinal_type cnt = 1;
3884 for (; cnt < npacks && j != nrows[cnt]; ++cnt)
3885 ;
3886 npacks = cnt;
3887 const local_ordinal_type pri = pri0 + j;
3888 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3889 for (local_ordinal_type i = 0; i < blocksize; ++i)
3890 for (local_ordinal_type v = 0; v < npacks; ++v)
3891 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0[v] + j) + i, col));
3892 }
3893 }
3894
3895 KOKKOS_INLINE_FUNCTION
3896 void
3897 operator()(const member_type &member) const {
3898 const local_ordinal_type packidx = member.league_rank();
3899 const local_ordinal_type partidx_begin = packptr(packidx);
3900 const local_ordinal_type npacks = packptr(packidx + 1) - partidx_begin;
3901 const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3902 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
3903 const local_ordinal_type partidx = partidx_begin + v;
3904 const local_ordinal_type ri0 = part2rowidx0(partidx);
3905 const local_ordinal_type nrows = part2rowidx0(partidx + 1) - ri0;
3906
3907 if (nrows == 1) {
3908 const local_ordinal_type pri = pri0;
3909 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
3910 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
3911 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0) + i, col));
3912 });
3913 }
3914 } else {
3915 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
3916 const local_ordinal_type pri = pri0 + j;
3917 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3918 for (local_ordinal_type i = 0; i < blocksize; ++i)
3919 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3920 });
3921 }
3922 });
3923 }
3924
3925 void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3926 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3927 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3928
3929 scalar_multivector = scalar_multivector_;
3930 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3931 const local_ordinal_type vl = vector_length;
3932 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3933 Kokkos::parallel_for("MultiVectorConverter::TeamPolicy", policy, *this);
3934 } else {
3935 const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3936 Kokkos::parallel_for("MultiVectorConverter::RangePolicy", policy, *this);
3937 }
3938 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3939 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3940 }
3941};
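// MultiVectorConverter gathers the flat (point) Tpetra multivector into the packed
// SIMD layout used by the tridiagonal solves. For SIMD lane v of a pack, row j of
// the part packed at lane v, intra-block row i, and column col, the index map is
//
//   packed_multivector(pri0 + j, i, col)[v]
//     = scalar_multivector(blocksize * lclrow(ri0[v] + j) + i, col);
//
// all parts of one pack advance in lockstep until the shorter parts are exhausted,
// which is why the flat-range operator() above shrinks npacks as j grows.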
3942
3943
3944template <typename MemorySpace>
3945struct SolveTridiagsDefaultModeAndAlgo;
3946
3947template <>
3948struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3949 typedef KB::Mode::Serial mode_type;
3950 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3951#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3952 typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3953#else
3954 typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3955#endif
3956 static int recommended_team_size(const int /* blksize */,
3957 const int /* vector_length */,
3958 const int /* internal_vector_length */) {
3959 return 1;
3960 }
3961};
3962
3963#if defined(KOKKOS_ENABLE_CUDA)
3964static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
3965 const int vector_length,
3966 const int internal_vector_length) {
3967 const int vector_size = vector_length / internal_vector_length;
3968 int total_team_size(0);
3969 if (blksize <= 5)
3970 total_team_size = 32;
3971 else if (blksize <= 9)
3972 total_team_size = 32; // 64
3973 else if (blksize <= 12)
3974 total_team_size = 96;
3975 else if (blksize <= 16)
3976 total_team_size = 128;
3977 else if (blksize <= 20)
3978 total_team_size = 160;
3979 else
3980 total_team_size = 160;
3981 return total_team_size / vector_size;
3982}
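// Worked example: for blksize = 10, the table above gives total_team_size = 96; with
// vector_length = 8 and internal_vector_length = 4, vector_size = 8 / 4 = 2, so the
// recommended team size is 96 / 2 = 48 and team_size * vector_size stays at the
// tuned total thread count per team.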
3983
3984template <>
3985struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3986 typedef KB::Mode::Team mode_type;
3987 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3988 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3989 static int recommended_team_size(const int blksize,
3990 const int vector_length,
3991 const int internal_vector_length) {
3992 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3993 }
3994};
3995template <>
3996struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3997 typedef KB::Mode::Team mode_type;
3998 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3999 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4000 static int recommended_team_size(const int blksize,
4001 const int vector_length,
4002 const int internal_vector_length) {
4003 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
4004 }
4005};
4006#endif
4007
4008#if defined(KOKKOS_ENABLE_HIP)
4009static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
4010 const int vector_length,
4011 const int internal_vector_length) {
4012 const int vector_size = vector_length / internal_vector_length;
4013 int total_team_size(0);
4014 if (blksize <= 5)
4015 total_team_size = 32;
4016 else if (blksize <= 9)
4017 total_team_size = 32; // 64
4018 else if (blksize <= 12)
4019 total_team_size = 96;
4020 else if (blksize <= 16)
4021 total_team_size = 128;
4022 else if (blksize <= 20)
4023 total_team_size = 160;
4024 else
4025 total_team_size = 160;
4026 return total_team_size / vector_size;
4027}
4028
4029template <>
4030struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
4031 typedef KB::Mode::Team mode_type;
4032 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4033 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4034 static int recommended_team_size(const int blksize,
4035 const int vector_length,
4036 const int internal_vector_length) {
4037 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
4038 }
4039};
4040template <>
4041struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
4042 typedef KB::Mode::Team mode_type;
4043 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4044 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4045 static int recommended_team_size(const int blksize,
4046 const int vector_length,
4047 const int internal_vector_length) {
4048 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
4049 }
4050};
4051#endif
4052
4053#if defined(KOKKOS_ENABLE_SYCL)
4054static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
4055 const int vector_length,
4056 const int internal_vector_length) {
4057 const int vector_size = vector_length / internal_vector_length;
4058 int total_team_size(0);
4059 if (blksize <= 5)
4060 total_team_size = 32;
4061 else if (blksize <= 9)
4062 total_team_size = 32; // 64
4063 else if (blksize <= 12)
4064 total_team_size = 96;
4065 else if (blksize <= 16)
4066 total_team_size = 128;
4067 else if (blksize <= 20)
4068 total_team_size = 160;
4069 else
4070 total_team_size = 160;
4071 return total_team_size / vector_size;
4072}
4073
4074template <>
4075struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
4076 typedef KB::Mode::Team mode_type;
4077 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4078 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4079 static int recommended_team_size(const int blksize,
4080 const int vector_length,
4081 const int internal_vector_length) {
4082 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4083 }
4084};
4085template <>
4086struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
4087 typedef KB::Mode::Team mode_type;
4088 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4089 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4090 static int recommended_team_size(const int blksize,
4091 const int vector_length,
4092 const int internal_vector_length) {
4093 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4094 }
4095};
4096#endif
4097
4098template <typename MatrixType>
4099struct SolveTridiags {
4100 public:
4101 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4102 using execution_space = typename impl_type::execution_space;
4103
4104 using local_ordinal_type = typename impl_type::local_ordinal_type;
4105 using size_type = typename impl_type::size_type;
4106 using impl_scalar_type = typename impl_type::impl_scalar_type;
4107 using magnitude_type = typename impl_type::magnitude_type;
4108 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
4109 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
4111 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4112 using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
4113 using size_type_2d_view = typename impl_type::size_type_2d_view;
4115 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4116 using internal_vector_type_3d_view = typename impl_type::internal_vector_type_3d_view;
4117 using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
4118 using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
4119 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
4120
4121 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
4122
4123 using internal_vector_type = typename impl_type::internal_vector_type;
4124 static constexpr int vector_length = impl_type::vector_length;
4125 static constexpr int internal_vector_length = impl_type::internal_vector_length;
4126
4128 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4129 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
4130
4132 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
4133 using member_type = typename team_policy_type::member_type;
4134
4135 private:
4136 // part interface
4137 local_ordinal_type n_subparts_per_part;
4138 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
4139 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
4140 const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
4141 const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
4142 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
4143 const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
4144 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
4145 const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
4146
4147 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
4148 const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
4149
4150 // block tridiags
4151 const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
4152
4153 // block tridiags values
4154 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
4155 const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
4156 const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
4157
4158 const Unmanaged<internal_vector_type_3d_view> X_internal_vector_values_schur;
4159
4160 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
4161 const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
4162
4163 const local_ordinal_type vector_loop_size;
4164
4165 // copy to multivectors : damping factor and Y_scalar_multivector
4166 Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
4167#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
4168 AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4169#else
4170 /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4171#endif
4172 const impl_scalar_type df;
4173 const bool compute_diff;
4174 // Schur solve only supports solving one vector at a time (currently).
4175 // If solving on a multivector, we loop over each vec in the solve.
4176 // This is the current vec being solved.
4177 local_ordinal_type active_schur_solve_vec;
4178
4179 public:
4180 SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
4181 const BlockTridiags<MatrixType> &btdm,
4182 const vector_type_3d_view &pmv,
4183 const impl_scalar_type damping_factor,
4184 const bool is_norm_manager_active)
4185 : // interface
4186 n_subparts_per_part(interf.n_subparts_per_part)
4187 , partptr(interf.partptr)
4188 , packptr(interf.packptr)
4189 , packindices_sub(interf.packindices_sub)
4190 , packindices_schur(interf.packindices_schur)
4191 , part2packrowidx0(interf.part2packrowidx0)
4192 , part2packrowidx0_sub(interf.part2packrowidx0_sub)
4193 , lclrow(interf.lclrow)
4194 , packptr_sub(interf.packptr_sub)
4195 , partptr_sub(interf.partptr_sub)
4196 , pack_td_ptr_schur(btdm.pack_td_ptr_schur)
4197 ,
4198 // block tridiags and multivector
4199 pack_td_ptr(btdm.pack_td_ptr)
4200 , D_internal_vector_values((internal_vector_type *)btdm.values.data(),
4201 btdm.values.extent(0),
4202 btdm.values.extent(1),
4203 btdm.values.extent(2),
4204 vector_length / internal_vector_length)
4205 , X_internal_vector_values((internal_vector_type *)pmv.data(),
4206 pmv.extent(0),
4207 pmv.extent(1),
4208 pmv.extent(2),
4209 vector_length / internal_vector_length)
4210 , X_internal_scalar_values((btdm_scalar_type *)pmv.data(),
4211 pmv.extent(0),
4212 pmv.extent(1),
4213 pmv.extent(2),
4214 vector_length)
4215 , X_internal_vector_values_schur(btdm.X_internal_vector_values_schur)
4216 , D_internal_vector_values_schur((internal_vector_type *)btdm.values_schur.data(),
4217 btdm.values_schur.extent(0),
4218 btdm.values_schur.extent(1),
4219 btdm.values_schur.extent(2),
4220 vector_length / internal_vector_length)
4221 , e_internal_vector_values((internal_vector_type *)btdm.e_values.data(),
4222 btdm.e_values.extent(0),
4223 btdm.e_values.extent(1),
4224 btdm.e_values.extent(2),
4225 btdm.e_values.extent(3),
4226 vector_length / internal_vector_length)
4227 , vector_loop_size(vector_length / internal_vector_length)
4228 , Y_scalar_multivector()
4229 , Z_scalar_vector()
4230 , df(damping_factor)
4231 , compute_diff(is_norm_manager_active)
4232 , active_schur_solve_vec(0) {}
4233
4234 public:
4236 KOKKOS_INLINE_FUNCTION
4237 void
4238 copyToFlatMultiVector(const member_type &member,
4239 const local_ordinal_type partidxbeg, // partidx for v = 0
4240 const local_ordinal_type npacks,
4241 const local_ordinal_type pri0,
4242 const local_ordinal_type v, // index with a loop of vector_loop_size
4243 const local_ordinal_type blocksize,
4244 const local_ordinal_type num_vectors) const {
4245 const local_ordinal_type vbeg = v * internal_vector_length;
4246 if (vbeg < npacks) {
4247 local_ordinal_type ri0_vals[internal_vector_length] = {};
4248 local_ordinal_type nrows_vals[internal_vector_length] = {};
4249 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4250 const local_ordinal_type partidx = partidxbeg + vv;
4251 ri0_vals[vi] = partptr(partidx);
4252 nrows_vals[vi] = partptr(partidx + 1) - ri0_vals[vi];
4253 }
4254
4255 impl_scalar_type z_partial_sum(0);
4256 if (nrows_vals[0] == 1) {
4257 const local_ordinal_type j = 0, pri = pri0;
4258 {
4259 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4260 const local_ordinal_type ri0 = ri0_vals[vi];
4261 const local_ordinal_type nrows = nrows_vals[vi];
4262 if (j < nrows) {
4263 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
4264 [&](const local_ordinal_type &i) {
4265 const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4266 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4267 impl_scalar_type &y = Y_scalar_multivector(row, col);
4268 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4269 y += df * yd;
4270
4271 { // if (compute_diff) {
4272#if KOKKOS_VERSION >= 40799
4273 const auto yd_abs = KokkosKernels::ArithTraits<impl_scalar_type>::abs(yd);
4274#else
4275 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4276#endif
4277 z_partial_sum += yd_abs * yd_abs;
4278 }
4279 }
4280 });
4281 }
4282 }
4283 }
4284 } else {
4285 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows_vals[0]),
4286 [&](const local_ordinal_type &j) {
4287 const local_ordinal_type pri = pri0 + j;
4288 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4289 const local_ordinal_type ri0 = ri0_vals[vi];
4290 const local_ordinal_type nrows = nrows_vals[vi];
4291 if (j < nrows) {
4292 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4293 for (local_ordinal_type i = 0; i < blocksize; ++i) {
4294 const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4295 impl_scalar_type &y = Y_scalar_multivector(row, col);
4296 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4297 y += df * yd;
4298
4299 { // if (compute_diff) {
4300#if KOKKOS_VERSION >= 40799
4301 const auto yd_abs = KokkosKernels::ArithTraits<impl_scalar_type>::abs(yd);
4302#else
4303 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4304#endif
4305 z_partial_sum += yd_abs * yd_abs;
4306 }
4307 }
4308 }
4309 }
4310 }
4311 });
4312 }
4313 // if (compute_diff)
4314 Z_scalar_vector(member.league_rank()) += z_partial_sum;
4315 }
4316 }
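// copyToFlatMultiVector applies the damped update and accumulates the convergence
// diff in a single pass. With x_hat the packed solve result and df the damping
// factor, each entry is updated as
//
//   y := y + df * (x_hat - y),
//
// while z_partial_sum accumulates |x_hat - y|^2; the per-team totals land in
// Z_scalar_vector and are reduced later by the norm manager. For df = 1 the update
// reduces to y := x_hat, i.e. an undamped Jacobi step.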
4317
4321 template <typename WWViewType>
4322 KOKKOS_INLINE_FUNCTION void
4323 solveSingleVector(const member_type &member,
4324 const local_ordinal_type &blocksize,
4325 const local_ordinal_type &i0,
4326 const local_ordinal_type &r0,
4327 const local_ordinal_type &nrows,
4328 const local_ordinal_type &v,
4329 const WWViewType &WW) const {
4330 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4331
4332 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4333 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4334
4335 // base pointers
4336 auto A = D_internal_vector_values.data();
4337 auto X = X_internal_vector_values.data();
4338
4339 // constant
4340#if KOKKOS_VERSION >= 40799
4341 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4342#else
4343 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4344#endif
4345#if KOKKOS_VERSION >= 40799
4346 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
4347#else
4348 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4349#endif
4350 // const local_ordinal_type num_vectors = X_scalar_values.extent(2);
4351
4352 // const local_ordinal_type blocksize = D_scalar_values.extent(1);
4353 const local_ordinal_type astep = D_internal_vector_values.stride(0);
4354 const local_ordinal_type as0 = D_internal_vector_values.stride(1); // blocksize*vector_length;
4355 const local_ordinal_type as1 = D_internal_vector_values.stride(2); // vector_length;
4356 const local_ordinal_type xstep = X_internal_vector_values.stride(0);
4357 const local_ordinal_type xs0 = X_internal_vector_values.stride(1); // vector_length;
4358
4359 // move to starting point
4360 A += i0 * astep + v;
4361 X += r0 * xstep + v;
4362
4363 // for (local_ordinal_type col=0;col<num_vectors;++col)
4364 if (nrows > 1) {
4365 // solve Lx = x
4366 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4367 member,
4368 KB::Diag::Unit,
4369 blocksize, blocksize,
4370 one,
4371 A, as0, as1,
4372 X, xs0);
4373
4374 for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
4375 member.team_barrier();
4376 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4377 member,
4378 blocksize, blocksize,
4379 -one,
4380 A + 2 * astep, as0, as1,
4381 X, xs0,
4382 one,
4383 X + 1 * xstep, xs0);
4384 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4385 member,
4386 KB::Diag::Unit,
4387 blocksize, blocksize,
4388 one,
4389 A + 3 * astep, as0, as1,
4390 X + 1 * xstep, xs0);
4391
4392 A += 3 * astep;
4393 X += 1 * xstep;
4394 }
4395
4396 // solve Ux = x
4397 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4398 member,
4399 KB::Diag::NonUnit,
4400 blocksize, blocksize,
4401 one,
4402 A, as0, as1,
4403 X, xs0);
4404
4405 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4406 A -= 3 * astep;
4407 member.team_barrier();
4408 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4409 member,
4410 blocksize, blocksize,
4411 -one,
4412 A + 1 * astep, as0, as1,
4413 X, xs0,
4414 one,
4415 X - 1 * xstep, xs0);
4416 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4417 member,
4418 KB::Diag::NonUnit,
4419 blocksize, blocksize,
4420 one,
4421 A, as0, as1,
4422 X - 1 * xstep, xs0);
4423 X -= 1 * xstep;
4424 }
4425 // for multiple rhs
4426 // X += xs1;
4427 } else {
4428 const local_ordinal_type ws0 = WW.stride(0);
4429 auto W = WW.data() + v;
4430 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
4431 member, blocksize, X, xs0, W, ws0);
4432 member.team_barrier();
4433 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4434 member,
4435 blocksize, blocksize,
4436 one,
4437 A, as0, as1,
4438 W, xs0,
4439 zero,
4440 X, xs0);
4441 }
4442 }
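// Schematically, solveSingleVector performs block forward/backward substitution on
// the factored tridiagonal, whose blocks are stored three per row (stride 3 * astep):
//
//   forward sweep:  x_r := L_r^{-1} (x_r - B_r x_{r-1})   (unit-diagonal lower TRSV)
//   backward sweep: x_r := U_r^{-1} (x_r - C_r x_{r+1})   (non-unit upper TRSV)
//
// The nrows == 1 branch instead multiplies by the stored block, which for
// single-row parts already holds the explicit inverse of the diagonal block.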
4443
4444 template <typename WWViewType>
4445 KOKKOS_INLINE_FUNCTION void
4446 solveMultiVector(const member_type &member,
4447 const local_ordinal_type & /* blocksize */,
4448 const local_ordinal_type &i0,
4449 const local_ordinal_type &r0,
4450 const local_ordinal_type &nrows,
4451 const local_ordinal_type &v,
4452 const WWViewType &WW) const {
4453 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4454
4455 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4456 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4457
4458 // constant
4459#if KOKKOS_VERSION >= 40799
4460 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4461#else
4462 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4463#endif
4464#if KOKKOS_VERSION >= 40799
4465 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
4466#else
4467 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4468#endif
4469
4470 // subview pattern
4471 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4472 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4473 auto X2 = X1;
4474
4475 local_ordinal_type i = i0, r = r0;
4476
4477 if (nrows > 1) {
4478 // solve Lx = x
4479 KB::Trsm<member_type,
4480 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4481 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4482 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
4483 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
4484 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
4485 member.team_barrier();
4486 KB::Gemm<member_type,
4487 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4488 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4489 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
4490 KB::Trsm<member_type,
4491 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4492 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4493 X1.assign_data(X2.data());
4494 }
4495
4496 // solve Ux = x
4497 KB::Trsm<member_type,
4498 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4499 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4500 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4501 i -= 3;
4502 A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
4503 X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
4504 member.team_barrier();
4505 KB::Gemm<member_type,
4506 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4507 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4508
4509 A.assign_data(&D_internal_vector_values(i, 0, 0, v));
4510 KB::Trsm<member_type,
4511 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4512 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4513 X1.assign_data(X2.data());
4514 }
4515 } else {
4516 // matrix is already inverted
4517 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4518 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
4519 member.team_barrier();
4520 KB::Gemm<member_type,
4521 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4522 default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
4523 }
4524 }
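// Design note: solveMultiVector runs the same two sweeps as solveSingleVector, but
// with multiple right-hand sides it can use Level-3 kernels (Trsm/Gemm on
// blocksize x num_vectors panels) rather than the Level-2 Trsv/Gemv used for a
// single vector, amortizing each block factor across all columns.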
4525
4526 template <int B>
4527 struct SingleVectorTag {};
4528 template <int B>
4529 struct MultiVectorTag {};
4530
4531 template <int B>
4532 struct SingleVectorSubLineTag {};
4533 template <int B>
4534 struct SingleVectorApplyCTag {};
4535 template <int B>
4536 struct SingleVectorSchurTag {};
4537 template <int B>
4538 struct SingleVectorApplyETag {};
4539 template <int B>
4540 struct CopyVectorToFlatTag {};
4541 template <int B>
4542 struct SingleZeroingTag {};
4543
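// These empty structs drive Kokkos tag-based dispatch: the work tag of a TeamPolicy
// selects the matching operator() overload below, and the integer template
// parameter B bakes a compile-time block size into the kernel (B == 0 means the
// block size is read at run time). Illustrative use (hypothetical extents):
#if 0
Kokkos::TeamPolicy<execution_space, SingleVectorTag<5>> policy(nteams, team_size, vlen);
Kokkos::parallel_for("example", policy, *this);  // invokes operator()(SingleVectorTag<5>{}, member)
#endif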
4544 template <int B>
4545 KOKKOS_INLINE_FUNCTION void
4546 operator()(const SingleVectorTag<B> &, const member_type &member) const {
4547 const local_ordinal_type packidx = member.league_rank();
4548 const local_ordinal_type partidx = packptr(packidx);
4549 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4550 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4551 const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4552 const local_ordinal_type r0 = part2packrowidx0(partidx);
4553 const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4554 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4555 const local_ordinal_type num_vectors = 1;
4556 internal_vector_scratch_type_3d_view
4557 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4558 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4559 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4560 });
4561 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4562 solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4563 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4564 });
4565 }
4566
4567 template <int B>
4568 KOKKOS_INLINE_FUNCTION void
4569 operator()(const MultiVectorTag<B> &, const member_type &member) const {
4570 const local_ordinal_type packidx = member.league_rank();
4571 const local_ordinal_type partidx = packptr(packidx);
4572 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4573 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4574 const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4575 const local_ordinal_type r0 = part2packrowidx0(partidx);
4576 const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4577 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4578 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4579
4580 internal_vector_scratch_type_3d_view
4581 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4582 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4583 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4584 });
4585 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4586 solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4587 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4588 });
4589 }
4590
4591 template <int B>
4592 KOKKOS_INLINE_FUNCTION void
4593 operator()(const SingleVectorSubLineTag<B> &, const member_type &member) const {
4594 // btdm is packed with parts sorted from largest to smallest
4595 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4596
4597 const local_ordinal_type subpartidx = packptr_sub(packidx);
4598 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4599 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4600 const local_ordinal_type partidx = subpartidx % n_parts;
4601
4602 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
4603 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4604 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4605 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4606 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4607
4608 //(void) i0;
4609 //(void) nrows;
4610 (void)npacks;
4611
4612 internal_vector_scratch_type_3d_view
4613 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4614
4615 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4616 auto X_internal_vec = Kokkos::subview(X_internal_vector_values, Kokkos::ALL(), Kokkos::ALL(), active_schur_solve_vec, Kokkos::ALL());
4617 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vec, WW);
4618 });
4619 }
4620
4621 template <int B>
4622 KOKKOS_INLINE_FUNCTION void
4623 operator()(const SingleVectorApplyCTag<B> &, const member_type &member) const {
4624 // btdm is packed with parts sorted from largest to smallest
4625 // const local_ordinal_type packidx = packindices_schur(member.league_rank());
4626 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4627
4628 const local_ordinal_type subpartidx = packptr_sub(packidx);
4629 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4630 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4631 const local_ordinal_type partidx = subpartidx % n_parts;
4632 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4633
4634 // const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4635 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4636 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4637 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4638
4639 // Compute v_2 = v_2 - C v_1
4640
4641 const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
4642 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
4643 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0 + 2 : i0 + 2;
4644
4645 (void)i0_schur;
4646 (void)i0_offset;
4647
4648#if KOKKOS_VERSION >= 40799
4649 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4650#else
4651 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4652#endif
4653
4654 const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx) - 2 : 0;
4655 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx + 1) + 1;
4656
4657 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4658
4659 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4660 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4661
4662 if (local_subpartidx == 0) {
4663 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4664 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4665 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4666 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4667
4668 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4669 member,
4670 blocksize, blocksize,
4671 -one,
4672 C.data(), C.stride(0), C.stride(1),
4673 v_1.data(), v_1.stride(0),
4674 one,
4675 v_2.data(), v_2.stride(0));
4676 });
4677 } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4678 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4679 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), active_schur_solve_vec, v);
4680 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4681 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4682
4683 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4684 member,
4685 blocksize, blocksize,
4686 -one,
4687 C.data(), C.stride(0), C.stride(1),
4688 v_1.data(), v_1.stride(0),
4689 one,
4690 v_2.data(), v_2.stride(0));
4691 });
4692 } else {
4693 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4694 {
4695 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4696 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4697 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4698
4699 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4700 member,
4701 blocksize, blocksize,
4702 -one,
4703 C.data(), C.stride(0), C.stride(1),
4704 v_1.data(), v_1.stride(0),
4705 one,
4706 v_2.data(), v_2.stride(0));
4707 }
4708 {
4709 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), active_schur_solve_vec, v);
4710 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4711 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4712
4713 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4714 member,
4715 blocksize, blocksize,
4716 -one,
4717 C.data(), C.stride(0), C.stride(1),
4718 v_1.data(), v_1.stride(0),
4719 one,
4720 v_2.data(), v_2.stride(0));
4721 }
4722 });
4723 }
4724 }
4725
4726 template <int B>
4727 KOKKOS_INLINE_FUNCTION void
4728 operator()(const SingleVectorSchurTag<B> &, const member_type &member) const {
4729 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4730
4731 const local_ordinal_type partidx = packptr_sub(packidx);
4732
4733 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4734
4735 const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx, 0);
4736 const local_ordinal_type nrows = 2 * (n_subparts_per_part - 1);
4737
4738 const local_ordinal_type r0_schur = nrows * member.league_rank();
4739
4740 internal_vector_scratch_type_3d_view
4741 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4742
4743 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4744 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4745 for (local_ordinal_type i = 0; i < 2; ++i) {
4746 copy3DView<local_ordinal_type>(member,
4747 Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL()),
4748 Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), active_schur_solve_vec, Kokkos::ALL()));
4749 }
4750 }
4751
4752 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4753 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4754 });
4755
4756 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4757 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4758 for (local_ordinal_type i = 0; i < 2; ++i) {
4759 copy3DView<local_ordinal_type>(member,
4760 Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), active_schur_solve_vec, Kokkos::ALL()),
4761 Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL()));
4762 }
4763 }
4764 }
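// Taken together, the tags above and below stage the line-splitting (Schur
// complement) solve for one right-hand-side vector:
//   1. SingleVectorSubLineTag: solve each interior sub-line independently.
//   2. SingleVectorApplyCTag:  v_2 -= C v_1, folding sub-line results into the
//      interface rows that couple adjacent sub-lines.
//   3. SingleVectorSchurTag:   gather the interface rows, solve the reduced Schur
//      tridiagonal system, and scatter the result back (the copy3DView calls above).
//   4. SingleVectorApplyETag:  v_1 -= E v_2, correcting the interior rows with the
//      now-known interface values.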
4765
4766 template <int B>
4767 KOKKOS_INLINE_FUNCTION void
4768 operator()(const SingleVectorApplyETag<B> &, const member_type &member) const {
4769 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4770
4771 const local_ordinal_type subpartidx = packptr_sub(packidx);
4772 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4773 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4774 const local_ordinal_type partidx = subpartidx % n_parts;
4775 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4776
4777 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4778 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4779
4780 // Compute v_2 = v_2 - C v_1
4781
4782#if KOKKOS_VERSION >= 40799
4783 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4784#else
4785 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4786#endif
4787
4788 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4789
4790 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4791 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4792
4793 if (local_subpartidx == 0) {
4794 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4795 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4796
4797 for (local_ordinal_type row = 0; row < nrows; ++row) {
4798 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4799 auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4800
4801 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4802 member,
4803 blocksize, blocksize,
4804 -one,
4805 E.data(), E.stride(0), E.stride(1),
4806 v_2.data(), v_2.stride(0),
4807 one,
4808 v_1.data(), v_1.stride(0));
4809 }
4810 });
4811 } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4812 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4813 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4814
4815 for (local_ordinal_type row = 0; row < nrows; ++row) {
4816 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4817 auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4818
4819 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4820 member,
4821 blocksize, blocksize,
4822 -one,
4823 E.data(), E.stride(0), E.stride(1),
4824 v_2.data(), v_2.stride(0),
4825 one,
4826 v_1.data(), v_1.stride(0));
4827 }
4828 });
4829 } else {
4830 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4831 {
4832 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4833
4834 for (local_ordinal_type row = 0; row < nrows; ++row) {
4835 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4836 auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4837
4838 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4839 member,
4840 blocksize, blocksize,
4841 -one,
4842 E.data(), E.stride(0), E.stride(1),
4843 v_2.data(), v_2.stride(0),
4844 one,
4845 v_1.data(), v_1.stride(0));
4846 }
4847 }
4848 {
4849 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4850
4851 for (local_ordinal_type row = 0; row < nrows; ++row) {
4852 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4853 auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4854
4855 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4856 member,
4857 blocksize, blocksize,
4858 -one,
4859 E.data(), E.stride(0), E.stride(1),
4860 v_2.data(), v_2.stride(0),
4861 one,
4862 v_1.data(), v_1.stride(0));
4863 }
4864 }
4865 });
4866 }
4867 }
4868
4869 template <int B>
4870 KOKKOS_INLINE_FUNCTION void
4871 operator()(const CopyVectorToFlatTag<B> &, const member_type &member) const {
4872 const local_ordinal_type packidx = member.league_rank();
4873 const local_ordinal_type partidx = packptr(packidx);
4874 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4875 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4876 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4877 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4878
4879 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4880 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4881 });
4882 }
4883
4884 template <int B>
4885 KOKKOS_INLINE_FUNCTION void
4886 operator()(const SingleZeroingTag<B> &, const member_type &member) const {
4887 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4888 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4889 });
4890 }
4891
4892 void run(const impl_scalar_type_2d_view_tpetra &Y,
4893 const impl_scalar_type_1d_view &Z) {
4894 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4895 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags", SolveTridiags);
4896
4898 this->Y_scalar_multivector = Y;
4899 this->Z_scalar_vector = Z;
4900
4901 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4902 const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4903
4904 const local_ordinal_type team_size =
4905 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4906 recommended_team_size(blocksize, vector_length, internal_vector_length);
4907 const int per_team_scratch = internal_vector_scratch_type_3d_view::shmem_size(blocksize, num_vectors, vector_loop_size);
4908
4909#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4910 if (packindices_schur.extent(1) <= 0) { \
4911 if (num_vectors == 1) { \
4912 Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4913 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4914 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4915 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4916 policy, *this); \
4917 } else { \
4918 Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4919 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4920 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4921 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4922 policy, *this); \
4923 } \
4924 } else { \
4925 { \
4926 Kokkos::TeamPolicy<execution_space, SingleZeroingTag<B>> \
4927 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4928 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4929 policy, *this); \
4930 } \
4931 for (local_ordinal_type vec = 0; vec < num_vectors; vec++) { \
4932 this->active_schur_solve_vec = vec; \
4933 { \
4934 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4935 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4936 Kokkos::TeamPolicy<execution_space, SingleVectorSubLineTag<B>> \
4937 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4938 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4939 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4940 policy, *this); \
4941 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4942 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4943 } \
4944 { \
4945 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4946 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4947 Kokkos::TeamPolicy<execution_space, SingleVectorApplyCTag<B>> \
4948 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4949 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4950 policy, *this); \
4951 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4952 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4953 } \
4954 { \
4955 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4956 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4957 Kokkos::TeamPolicy<execution_space, SingleVectorSchurTag<B>> \
4958 policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4959 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4960 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4961 policy, *this); \
4962 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4963 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4964 } \
4965 { \
4966 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4967 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4968 Kokkos::TeamPolicy<execution_space, SingleVectorApplyETag<B>> \
4969 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4970 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4971 policy, *this); \
4972 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4973 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4974 } \
4975 } \
4976 { \
4977 Kokkos::TeamPolicy<execution_space, CopyVectorToFlatTag<B>> \
4978 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4979 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<CopyVectorToFlatTag>", \
4980 policy, *this); \
4981 } \
4982 } \
4983 break
4984 switch (blocksize) {
4985 case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(3);
4986 case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(5);
4987 case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(6);
4988 case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(7);
4989 case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4990 case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4991 case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4992 case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4993 case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4994 case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4995 case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4996 case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4997 default: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(0);
4998 }
4999#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
5000
5001 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
5002 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
5003 }
5004};
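// Design note: run() dispatches through BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
// for a set of commonly used block sizes so the kernels are instantiated with a
// compile-time bound (e.g. case 5 instantiates B = 5), letting the compiler fully
// unroll the inner block loops; any other size falls through to the generic B = 0
// instantiation, which reads the block size at run time.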
5005
5009template <typename MatrixType>
5010int applyInverseJacobi( // importer
5011 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
5012 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
5013 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5014 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5015 const bool overlap_communication_and_computation,
5016 // tpetra interface
5017 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5018 /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5019 /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
5020 /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5021 // local object interface
5022 const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5023 const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5024 const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5025 /* */ typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
5026 /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5027 // preconditioner parameters
5028 const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5029 /* */ bool is_y_zero,
5030 const int max_num_sweeps,
5031 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5032 const int check_tol_every) {
5033 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
5034
5035 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5036 using node_memory_space = typename impl_type::node_memory_space;
5037 using local_ordinal_type = typename impl_type::local_ordinal_type;
5038 using size_type = typename impl_type::size_type;
5039 using impl_scalar_type = typename impl_type::impl_scalar_type;
5040 using magnitude_type = typename impl_type::magnitude_type;
5041 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
5042 using vector_type_1d_view = typename impl_type::vector_type_1d_view;
5043 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
5044 using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
5045
5046 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5047
5048 // the tpetra importer and the async importer are mutually exclusive; they must not both be active
5049 TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5050 "Neither Tpetra importer nor Async importer is null.");
5051 // the maximum number of sweeps must be a positive number
5052 TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5053 "Maximum number of sweeps must be >= 1.");
5054
5055 // const parameters
5056 const bool is_seq_method_requested = !tpetra_importer.is_null();
5057 const bool is_async_importer_active = !async_importer.is_null();
5058#if KOKKOS_VERSION >= 40799
5059 const bool is_norm_manager_active = tol > KokkosKernels::ArithTraits<magnitude_type>::zero();
5060#else
5061 const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5062#endif
5063 const magnitude_type tolerance = tol * tol;
5064 const local_ordinal_type blocksize = btdm.values.extent(1);
5065 const local_ordinal_type num_vectors = Y.getNumVectors();
5066 const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
5067
5068 const impl_scalar_type zero(0.0);
5069
5070 TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
5071 "The seq method for applyInverseJacobi, "
5072 << "which in any case is for developer use only, "
5073 << "does not support norm-based termination.");
5074 const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
5075 Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
5076 TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
5077 std::invalid_argument,
5078 "The seq method for applyInverseJacobi, "
5079 << "which in any case is for developer use only, "
5080 << "only supports memory spaces accessible from host.");
5081
5082 // if more workspace is needed, resize it
5083 const size_type work_span_required = num_blockrows * num_vectors * blocksize;
5084 if (work.span() < work_span_required)
5085 work = vector_type_1d_view("vector workspace 1d view", work_span_required);
5086
5087 // construct W
5088 const local_ordinal_type W_size = interf.packptr.extent(0) - 1;
5089 if (local_ordinal_type(W.extent(0)) < W_size)
5090 W = impl_scalar_type_1d_view("W", W_size);
5091
5092 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5093 {
5094 if (is_seq_method_requested) {
5095 if (Z.getNumVectors() != Y.getNumVectors())
5096 Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
5097 } else {
5098 if (is_async_importer_active) {
5099 // create comm data buffer and keep it here
5100 async_importer->createDataBuffer(num_vectors);
5101 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5102 }
5103 }
5104 }
5105
5106 // wrap the workspace with 3d view
5107 vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
5108 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5109 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5110 const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
5111 if (is_y_zero) Kokkos::deep_copy(YY, zero);
5112
5113 MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
5114 SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
5115 damping_factor, is_norm_manager_active);
5116
5117 const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
5118
5119 auto A_crs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
5120 auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);
5121
5122 bool hasBlockCrsMatrix = !A_bcrs.is_null();
5123
5124 // It is OK here to use the graph of the A_crs matrix and a block size of 1
5125 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
5126
5127 BlockHelperDetails::ComputeResidualVector<MatrixType>
5128 compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
5129 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
5130 hasBlockCrsMatrix);
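// Taken together, the pieces above realize the damped block-Jacobi sweep: with
// a = damping_factor, R = A - D (see amd above), and D the packed block
// tridiagonals, each iteration of the loop below computes
//   pmv := D^{-1} (x - R y)
//   y   := (1 - a) y + a pmv
// so a = 1 recovers the undamped iteration y := D^{-1} (x - R y).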
5131
5132 // configure how frequently the norm manager checks convergence
5133 if (is_norm_manager_active)
5134 norm_manager.setCheckFrequency(check_tol_every);
5135
5136 // iterate
5137 int sweep = 0;
5138 for (; sweep < max_num_sweeps; ++sweep) {
5139 {
5140 if (is_y_zero) {
5141 // pmv := x(lclrow)
5142 multivector_converter.run(XX);
5143 } else {
5144 if (is_seq_method_requested) {
5145 // SEQ METHOD IS FOR TESTING ONLY
5146
5147 // y := x - R y
5148 Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5149 compute_residual_vector.run(YY, XX, ZZ);
5150
5151 // pmv := y(lclrow).
5152 multivector_converter.run(YY);
5153 } else {
5154 // fused y := x - R y and pmv := y(lclrow);
5155 // the real use case does not use overlapped computation and communication
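// The overlapped schedule in this branch, as sketched below:
//   1. asyncSendRecv(y)  - start exchanging the remote entries of y
//   2. run(..., true)    - residual contribution of owned columns only
//   3. checkDone(...)    - test convergence while messages are in flight
//   4. syncRecv()        - wait for the remote entries to arrive
//   5. run(..., false)   - fold in the contribution of remote columns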
5156 if (overlap_communication_and_computation || !is_async_importer_active) {
5157 if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5158 // OverlapTag, compute_owned = true
5159 compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
5160 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5161 if (is_async_importer_active) async_importer->cancel();
5162 break;
5163 }
5164 if (is_async_importer_active) {
5165 async_importer->syncRecv();
5166 // OverlapTag, compute_owned = false
5167 compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
5168 }
5169 } else {
5170 if (is_async_importer_active)
5171 async_importer->syncExchange(YY);
5172 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5173 // AsyncTag
5174 compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5175 }
5176 }
5177 }
5178 }
5179
5180 // pmv := inv(D) pmv.
5181 {
5182 solve_tridiags.run(YY, W);
5183 }
5184 {
5185 if (is_norm_manager_active) {
5186 // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
5187 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5188 if (sweep + 1 == max_num_sweeps) {
5189 norm_manager.ireduce(sweep, true);
5190 norm_manager.checkDone(sweep + 1, tolerance, true);
5191 } else {
5192 norm_manager.ireduce(sweep);
5193 }
5194 }
5195 }
5196 is_y_zero = false;
5197 }
5198
5199 // sqrt the norms for the caller's use.
5200 if (is_norm_manager_active) norm_manager.finalize();
5201
5202 return sweep;
5203}
5204
5205// Implementation of fused block Jacobi for a specific block size,
5206// or (if B == 0) for a general block size.
5207template <typename MatrixType, int B>
5208int applyFusedBlockJacobi_Impl(
5209 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5210 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5211 const bool overlap_communication_and_computation,
5212 // tpetra interface
5213 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5214 /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5215 /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5216 // local object interface
5217 const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5218 const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5219 const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5220 /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5221 /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5222 // preconditioner parameters
5223 const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5224 /* */ bool is_y_zero,
5225 const int max_num_sweeps,
5226 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5227 const int check_tol_every) {
5228 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5229 using local_ordinal_type = typename impl_type::local_ordinal_type;
5230 using size_type = typename impl_type::size_type;
5231 using magnitude_type = typename impl_type::magnitude_type;
5232 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5233 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
5234
5235 // the tpetra importer and async importer can't both be active
5236 TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5237 "Neither Tpetra importer nor Async importer is null.");
5238 // the maximum number of sweeps must be a positive number
5239 TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5240 "Maximum number of sweeps must be >= 1.");
5241
5242 // const parameters
5243 const bool is_async_importer_active = !async_importer.is_null();
5244#if KOKKOS_VERSION >= 40799
5245 const bool is_norm_manager_active = tol > KokkosKernels::ArithTraits<magnitude_type>::zero();
5246#else
5247 const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5248#endif
5249 const magnitude_type tolerance = tol * tol;
5250 const local_ordinal_type blocksize = btdm.d_inv.extent(1);
5251 const local_ordinal_type num_vectors = Y.getNumVectors();
5252 const local_ordinal_type num_blockrows = interf.nparts;
5253
5254 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5255 {
5256 if (is_async_importer_active) {
5257 // create comm data buffer and keep it here
5258 async_importer->createDataBuffer(num_vectors);
5259 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5260 }
5261 }
5262
5263 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5264 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5265
5266 const bool two_pass_residual =
5267 overlap_communication_and_computation && is_async_importer_active;
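// Two passes are required when overlapping: pass 1 may touch only owned columns,
// because the remote entries of y are still in flight; pass 2 runs after syncRecv()
// and folds in the remote columns, applies Dinv, and computes the norm
// (see the functor calls in the sweep loop below).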
5268
5269 // Calculate the required work size and reallocate it if not already big enough.
5270 // Check that our assumptions about YY dimension are correct.
5271 TEUCHOS_TEST_FOR_EXCEPT_MSG(
5272 size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1),
5273 "Local LHS vector (YY) has total size " << YY.extent(0) << "x" << YY.extent(1) << " = " << YY.extent(0) * YY.extent(1) << ",\n"
5274 << "but expected " << num_blockrows << "x" << blocksize << "x" << num_vectors << " = " << size_t(num_blockrows) * blocksize * num_vectors << '\n');
5275 size_type work_required = size_type(num_blockrows) * blocksize * num_vectors;
5276 if (work.extent(0) < work_required) {
5277 work = impl_scalar_type_1d_view(do_not_initialize_tag("flat workspace 1d view"), work_required);
5278 }
5279
5280 Unmanaged<impl_scalar_type_2d_view_tpetra> y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors);
5281
5282 // construct W
5283 if (W.extent(0) != size_t(num_blockrows))
5284 W = impl_scalar_type_1d_view(do_not_initialize_tag("W"), num_blockrows);
5285
5286 // Create the required functors upfront (this is inexpensive - all shallow copies)
5287 BlockHelperDetails::ComputeResidualAndSolve_SolveOnly<MatrixType, B>
5288 functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor);
5289 BlockHelperDetails::ComputeResidualAndSolve_1Pass<MatrixType, B>
5290 functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5291 BlockHelperDetails::ComputeResidualAndSolve_2Pass<MatrixType, B>
5292 functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5293
5294 // configure how frequently the norm manager checks convergence
5295 if (is_norm_manager_active)
5296 norm_manager.setCheckFrequency(check_tol_every);
5297
5298 // For double-buffering.
5299 // yy_buffers[current_y] has the current iterate of y.
5300 // yy_buffers[1-current_y] has the next iterate of y.
5301 Unmanaged<impl_scalar_type_2d_view_tpetra> y_buffers[2] = {YY, y_doublebuf};
5302 int current_y = 0;
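// A minimal sketch of the ping-pong indexing used in the sweep loop (names
// hypothetical):
//   auto &y_in  = y_buffers[current_y];      // current iterate, read by the functors
//   auto &y_out = y_buffers[1 - current_y];  // next iterate, written by the functors
//   current_y = 1 - current_y;               // flip after each sweep
// This keeps each kernel from reading and writing the same y storage within a sweep.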
5303
5304 // iterate
5305 int sweep = 0;
5306 for (; sweep < max_num_sweeps; ++sweep) {
5307 if (is_y_zero) {
5308 // If y is initially zero, then we are just computing y := damping_factor * Dinv * x
5309 functor_solve_only.run(XX, y_buffers[1 - current_y]);
5310 } else {
5311 // the real use case does not use overlapped computation and communication
5312 if (overlap_communication_and_computation || !is_async_importer_active) {
5313 if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]);
5314 if (two_pass_residual) {
5315 // Pass 1 computes owned residual and stores into new y buffer,
5316 // but doesn't apply Dinv or produce a norm yet
5317 functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1 - current_y]);
5318 } else {
5319 // This case happens when running on a single rank.
5320 // There are no remote columns, so residual and solve can happen in one step.
5321 functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5322 }
5323 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5324 if (is_async_importer_active) async_importer->cancel();
5325 break;
5326 }
5327 if (is_async_importer_active) {
5328 async_importer->syncRecv();
5329 // Pass 2 finishes computing the residual, then applies Dinv and computes the norm.
5330 functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5331 }
5332 } else {
5333 if (is_async_importer_active)
5334 async_importer->syncExchange(y_buffers[current_y]);
5335 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5336 // Full residual, Dinv apply, and norm in one kernel
5337 functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5338 }
5339 }
5340
5341 // Compute global norm.
5342 if (is_norm_manager_active) {
5343 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5344 if (sweep + 1 == max_num_sweeps) {
5345 norm_manager.ireduce(sweep, true);
5346 norm_manager.checkDone(sweep + 1, tolerance, true);
5347 } else {
5348 norm_manager.ireduce(sweep);
5349 }
5350 }
5351 is_y_zero = false;
5352 // flip the y buffers for the next iteration (or for termination, if we reached max_num_sweeps).
5353 current_y = 1 - current_y;
5354 }
5355 if (current_y == 1) {
5356 // We finished iterating with y in the double buffer, so copy it to the user's vector.
5357 Kokkos::deep_copy(YY, y_doublebuf);
5358 }
5359
5360 // sqrt the norms for the caller's use.
5361 if (is_norm_manager_active) norm_manager.finalize();
5362 return sweep;
5363}
5364
5368template <typename MatrixType>
5369int applyFusedBlockJacobi(
5370 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5371 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5372 const bool overlap_communication_and_computation,
5373 // tpetra interface
5374 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5375 /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5376 /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5377 // local object interface
5378 const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5379 const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5380 const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5381 /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5382 /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5383 // preconditioner parameters
5384 const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5385 /* */ bool is_y_zero,
5386 const int max_num_sweeps,
5387 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5388 const int check_tol_every) {
5389 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi);
5390 int blocksize = btdm.d_inv.extent(1);
5391 int sweep = 0;
5392#define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) \
5393 { \
5394 sweep = applyFusedBlockJacobi_Impl<MatrixType, B>( \
5395 tpetra_importer, async_importer, overlap_communication_and_computation, \
5396 X, Y, W, interf, btdm, amd, work, \
5397 norm_manager, damping_factor, is_y_zero, \
5398 max_num_sweeps, tol, check_tol_every); \
5399 } \
5400 break
5401 switch (blocksize) {
5402 case 3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(3);
5403 case 5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(5);
5404 case 7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(7);
5405 case 9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(9);
5406 case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10);
5407 case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11);
5408 case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16);
5409 case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17);
5410 case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18);
5411 default: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(0);
5412 }
5413#undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI
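// The switch above dispatches to applyFusedBlockJacobi_Impl with the block size as
// the compile-time constant B for a handful of common sizes, presumably so the
// kernels can size their inner loops over the block at compile time; B == 0 selects
// the general run-time-blocksize path (see the comment on applyFusedBlockJacobi_Impl
// above). Supporting another fixed size would just be one more case line, e.g.
// (hypothetical):
//   case 12: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(12);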
5414
5415 return sweep;
5416}
5417
5418template <typename MatrixType>
5419struct ImplObject {
5420 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5421 using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5422 using block_tridiags_type = BlockTridiags<MatrixType>;
5423 using amd_type = BlockHelperDetails::AmD<MatrixType>;
5424 using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
5425 using async_import_type = AsyncableImport<MatrixType>;
5426
5427 // distributed objects
5428 Teuchos::RCP<const typename impl_type::tpetra_row_matrix_type> A;
5429 Teuchos::RCP<const typename impl_type::tpetra_crs_graph_type> blockGraph;
5430 Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
5431 Teuchos::RCP<async_import_type> async_importer;
5432 bool overlap_communication_and_computation;
5433
5434 // copy of Y (mutable to penetrate const)
5435 mutable typename impl_type::tpetra_multivector_type Z;
5436 mutable typename impl_type::impl_scalar_type_1d_view W;
5437
5438 // local objects
5439 part_interface_type part_interface;
5440 block_tridiags_type block_tridiags; // D
5441 amd_type a_minus_d; // R = A - D
5442
5443 // whether to use fused block Jacobi path
5444 bool use_fused_jacobi;
5445
5446 // vector workspace, used for the general block tridiagonal case
5447 mutable typename impl_type::vector_type_1d_view work; // right hand side workspace (1D view of vector)
5448 // scalar workspace, used for the fused block Jacobi case
5449 mutable typename impl_type::impl_scalar_type_1d_view work_flat; // right hand side workspace (1D view of scalar)
5450 mutable norm_manager_type norm_manager;
5451};
5452
5453} // namespace BlockTriDiContainerDetails
5454
5455} // namespace Ifpack2
5456
5457#endif