10#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
11#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
16#include <Teuchos_Details_MpiTypeTraits.hpp>
18#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
19#include <Tpetra_Distributor.hpp>
20#include <Tpetra_BlockMultiVector.hpp>
22#include <KokkosKernels_ArithTraits.hpp>
23#include <KokkosBatched_Util.hpp>
24#include <KokkosBatched_Vector.hpp>
25#include <KokkosBatched_Copy_Decl.hpp>
26#include <KokkosBatched_Copy_Impl.hpp>
27#include <KokkosBatched_AddRadial_Decl.hpp>
28#include <KokkosBatched_AddRadial_Impl.hpp>
29#include <KokkosBatched_SetIdentity_Decl.hpp>
30#include <KokkosBatched_SetIdentity_Impl.hpp>
31#include <KokkosBatched_Gemm_Decl.hpp>
32#include <KokkosBatched_Gemm_Serial_Impl.hpp>
33#include <KokkosBatched_Gemm_Team_Impl.hpp>
34#include <KokkosBatched_Gemv_Decl.hpp>
35#include <KokkosBatched_Gemv_Team_Impl.hpp>
36#include <KokkosBatched_Trsm_Decl.hpp>
37#include <KokkosBatched_Trsm_Serial_Impl.hpp>
38#include <KokkosBatched_Trsm_Team_Impl.hpp>
39#include <KokkosBatched_Trsv_Decl.hpp>
40#include <KokkosBatched_Trsv_Serial_Impl.hpp>
41#include <KokkosBatched_Trsv_Team_Impl.hpp>
42#include <KokkosBatched_LU_Decl.hpp>
43#include <KokkosBatched_LU_Serial_Impl.hpp>
44#include <KokkosBatched_LU_Team_Impl.hpp>
46#include <KokkosBlas1_nrm1.hpp>
47#include <KokkosBlas1_nrm2.hpp>
51#include "Ifpack2_BlockHelper.hpp"
52#include "Ifpack2_BlockComputeResidualVector.hpp"
53#include "Ifpack2_BlockComputeResidualAndSolve.hpp"
58#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
59#include "cuda_profiler_api.h"
64#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
72#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
76#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
79#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
80#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
84#define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
88namespace BlockTriDiContainerDetails {
90namespace KB = KokkosBatched;
95using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
97template <
typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
98using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
99 MemoryTraitsType::is_random_access |
102template <
typename ViewType>
103using Unmanaged = Kokkos::View<
typename ViewType::data_type,
104 typename ViewType::array_layout,
105 typename ViewType::device_type,
106 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
107template <
typename ViewType>
108using Atomic = Kokkos::View<
typename ViewType::data_type,
109 typename ViewType::array_layout,
110 typename ViewType::device_type,
111 MemoryTraits<typename ViewType::memory_traits, Kokkos::Atomic>>;
112template <
typename ViewType>
113using Const = Kokkos::View<
typename ViewType::const_data_type,
114 typename ViewType::array_layout,
115 typename ViewType::device_type,
116 typename ViewType::memory_traits>;
117template <
typename ViewType>
118using ConstUnmanaged = Const<Unmanaged<ViewType>>;
120template <
typename ViewType>
121using AtomicUnmanaged = Atomic<Unmanaged<ViewType>>;
123template <
typename ViewType>
124using Unmanaged = Kokkos::View<
typename ViewType::data_type,
125 typename ViewType::array_layout,
126 typename ViewType::device_type,
127 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
129template <
typename ViewType>
130using Scratch = Kokkos::View<
typename ViewType::data_type,
131 typename ViewType::array_layout,
132 typename ViewType::execution_space::scratch_memory_space,
133 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
142#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
150#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
151#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
152 KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
154#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
155 { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStop()); }
158#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
159#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
165template <
typename MatrixType>
166typename Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
167createBlockCrsTpetraImporter(
const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
168 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
170 using tpetra_map_type =
typename impl_type::tpetra_map_type;
171 using tpetra_mv_type =
typename impl_type::tpetra_block_multivector_type;
172 using tpetra_import_type =
typename impl_type::tpetra_import_type;
173 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
174 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
176 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
177 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
179 bool hasBlockCrsMatrix = !A_bcrs.is_null();
182 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());
184 const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
185 const auto src = Teuchos::rcp(
new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
186 const auto tgt = Teuchos::rcp(
new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap(), blocksize)));
187 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
188 return Teuchos::rcp(
new tpetra_import_type(src, tgt));
196template <
typename MatrixType>
197struct AsyncableImport {
199 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
205#if !defined(HAVE_IFPACK2_MPI)
206 typedef int MPI_Request;
207 typedef int MPI_Comm;
211 using scalar_type =
typename impl_type::scalar_type;
213 static int isend(
const MPI_Comm comm,
const char *buf,
int count,
int dest,
int tag, MPI_Request *ireq) {
214#ifdef HAVE_IFPACK2_MPI
216 int ret = MPI_Isend(
const_cast<char *
>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
217 if (ireq == NULL) MPI_Request_free(&ureq);
224 static int irecv(
const MPI_Comm comm,
char *buf,
int count,
int src,
int tag, MPI_Request *ireq) {
225#ifdef HAVE_IFPACK2_MPI
227 int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
228 if (ireq == NULL) MPI_Request_free(&ureq);
235 static int waitany(
int count, MPI_Request *reqs,
int *index) {
236#ifdef HAVE_IFPACK2_MPI
237 return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
243 static int waitall(
int count, MPI_Request *reqs) {
244#ifdef HAVE_IFPACK2_MPI
245 return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
252 using tpetra_map_type =
typename impl_type::tpetra_map_type;
253 using tpetra_import_type =
typename impl_type::tpetra_import_type;
255 using local_ordinal_type =
typename impl_type::local_ordinal_type;
256 using global_ordinal_type =
typename impl_type::global_ordinal_type;
260 using int_1d_view_host = Kokkos::View<int *, Kokkos::HostSpace>;
261 using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type *, Kokkos::HostSpace>;
263 using execution_space =
typename impl_type::execution_space;
264 using memory_space =
typename impl_type::memory_space;
265 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
267 using size_type_1d_view_host = Kokkos::View<size_type *, Kokkos::HostSpace>;
269#if defined(KOKKOS_ENABLE_CUDA)
270 using impl_scalar_type_1d_view =
271 typename std::conditional<std::is_same<execution_space, Kokkos::Cuda>::value,
272#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
273 Kokkos::View<impl_scalar_type *, Kokkos::CudaHostPinnedSpace>,
274#elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
275 Kokkos::View<impl_scalar_type *, Kokkos::CudaSpace>,
277 typename impl_type::impl_scalar_type_1d_view,
279 typename impl_type::impl_scalar_type_1d_view>::type;
281 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
283 using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type *, Kokkos::HostSpace>;
284 using impl_scalar_type_2d_view =
typename impl_type::impl_scalar_type_2d_view;
285 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
287#ifdef HAVE_IFPACK2_MPI
291 impl_scalar_type_2d_view_tpetra remote_multivector;
292 local_ordinal_type blocksize;
294 template <
typename T>
295 struct SendRecvPair {
300 SendRecvPair<int_1d_view_host> pids;
301 SendRecvPair<std::vector<MPI_Request>> reqs;
302 SendRecvPair<size_type_1d_view> offset;
303 SendRecvPair<size_type_1d_view_host> offset_host;
304 SendRecvPair<local_ordinal_type_1d_view> lids;
305 SendRecvPair<impl_scalar_type_1d_view> buffer;
306 SendRecvPair<impl_scalar_type_1d_view_host> buffer_host;
308 local_ordinal_type_1d_view dm2cm;
310#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
311 using exec_instance_1d_std_vector = std::vector<execution_space>;
312 exec_instance_1d_std_vector exec_instances;
317 void setOffsetValues(
const Teuchos::ArrayView<const size_t> &lens,
318 const size_type_1d_view &offs) {
320 Kokkos::View<size_t *, Kokkos::HostSpace> lens_host(
const_cast<size_t *
>(lens.getRawPtr()), lens.size());
321 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
324 const Kokkos::RangePolicy<execution_space> policy(0, offs.extent(0));
325 const local_ordinal_type lens_size = lens_device.extent(0);
326 Kokkos::parallel_scan(
327 "AsyncableImport::RangePolicy::setOffsetValues",
328 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i, size_type &update,
const bool &
final) {
331 update += (i < lens_size ? lens_device[i] : 0);
335 void setOffsetValuesHost(
const Teuchos::ArrayView<const size_t> &lens,
336 const size_type_1d_view_host &offs) {
338 Kokkos::View<size_t *, Kokkos::HostSpace> lens_host(
const_cast<size_t *
>(lens.getRawPtr()), lens.size());
339 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
343 for (local_ordinal_type i = 1, iend = offs.extent(0); i < iend; ++i) {
344 offs(i) = offs(i - 1) + lens[i - 1];
349 void createMpiRequests(
const tpetra_import_type &
import) {
350 Tpetra::Distributor &distributor =
import.getDistributor();
353 const auto pids_from = distributor.getProcsFrom();
354 pids.recv = int_1d_view_host(do_not_initialize_tag(
"pids recv"), pids_from.size());
355 memcpy(pids.recv.data(), pids_from.getRawPtr(),
sizeof(
int) * pids.recv.extent(0));
357 const auto pids_to = distributor.getProcsTo();
358 pids.send = int_1d_view_host(do_not_initialize_tag(
"pids send"), pids_to.size());
359 memcpy(pids.send.data(), pids_to.getRawPtr(),
sizeof(
int) * pids.send.extent(0));
362 reqs.recv.resize(pids.recv.extent(0));
363 memset(reqs.recv.data(), 0, reqs.recv.size() *
sizeof(MPI_Request));
364 reqs.send.resize(pids.send.extent(0));
365 memset(reqs.send.data(), 0, reqs.send.size() *
sizeof(MPI_Request));
369 const auto lengths_to = distributor.getLengthsTo();
370 offset.send = size_type_1d_view(do_not_initialize_tag(
"offset send"), lengths_to.size() + 1);
372 const auto lengths_from = distributor.getLengthsFrom();
373 offset.recv = size_type_1d_view(do_not_initialize_tag(
"offset recv"), lengths_from.size() + 1);
375 setOffsetValues(lengths_to, offset.send);
376 offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
378 setOffsetValues(lengths_from, offset.recv);
379 offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
381 const auto lengths_to = distributor.getLengthsTo();
382 offset_host.send = size_type_1d_view_host(do_not_initialize_tag(
"offset send"), lengths_to.size() + 1);
384 const auto lengths_from = distributor.getLengthsFrom();
385 offset_host.recv = size_type_1d_view_host(do_not_initialize_tag(
"offset recv"), lengths_from.size() + 1);
387 setOffsetValuesHost(lengths_to, offset_host.send);
390 setOffsetValuesHost(lengths_from, offset_host.recv);
395 void createSendRecvIDs(
const tpetra_import_type &
import) {
397 const auto remote_lids =
import.getRemoteLIDs();
398 const local_ordinal_type_1d_view_host
399 remote_lids_view_host(
const_cast<local_ordinal_type *
>(remote_lids.getRawPtr()), remote_lids.size());
400 lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag(
"lids recv"), remote_lids.size());
401 Kokkos::deep_copy(lids.recv, remote_lids_view_host);
404 auto epids =
import.getExportPIDs();
405 auto elids =
import.getExportLIDs();
406 TEUCHOS_ASSERT(epids.size() == elids.size());
407 lids.send = local_ordinal_type_1d_view(do_not_initialize_tag(
"lids send"), elids.size());
408 auto lids_send_host = Kokkos::create_mirror_view(lids.send);
411 for (local_ordinal_type cnt = 0, i = 0, iend = pids.send.extent(0); i < iend; ++i) {
412 const auto pid_send_value = pids.send[i];
413 for (local_ordinal_type j = 0, jend = epids.size(); j < jend; ++j)
414 if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
415 TEUCHOS_ASSERT(
static_cast<size_t>(cnt) == offset_host.send[i + 1]);
417 Kokkos::deep_copy(lids.send, lids_send_host);
420 void createExecutionSpaceInstances() {
421#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
424 Kokkos::Experimental::partition_space(execution_space(), std::vector<int>(8, 1));
431 struct ToMultiVector {};
433 AsyncableImport(
const Teuchos::RCP<const tpetra_map_type> &src_map,
434 const Teuchos::RCP<const tpetra_map_type> &tgt_map,
435 const local_ordinal_type blocksize_,
436 const local_ordinal_type_1d_view dm2cm_) {
437 blocksize = blocksize_;
440#ifdef HAVE_IFPACK2_MPI
441 comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
443 const tpetra_import_type
import(src_map, tgt_map);
445 createMpiRequests(
import);
446 createSendRecvIDs(
import);
447 createExecutionSpaceInstances();
450 void createDataBuffer(
const local_ordinal_type &num_vectors) {
451 const size_type extent_0 = lids.recv.extent(0) * blocksize;
452 const size_type extent_1 = num_vectors;
453 if (remote_multivector.extent(0) == extent_0 &&
454 remote_multivector.extent(1) == extent_1) {
458 impl_scalar_type_2d_view_tpetra(do_not_initialize_tag(
"remote multivector"), extent_0, extent_1);
460 const auto send_buffer_size = offset_host.send[offset_host.send.extent(0) - 1] * blocksize * num_vectors;
461 const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0) - 1] * blocksize * num_vectors;
463 buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag(
"buffer send"), send_buffer_size);
464 buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag(
"buffer recv"), recv_buffer_size);
466 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
467 buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag(
"buffer send"), send_buffer_size);
468 buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag(
"buffer recv"), recv_buffer_size);
474#ifdef HAVE_IFPACK2_MPI
475 waitall(reqs.recv.size(), reqs.recv.data());
476 waitall(reqs.send.size(), reqs.send.data());
484#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
485 template <
typename PackTag>
486 static void copy(
const local_ordinal_type_1d_view &lids_,
487 const impl_scalar_type_1d_view &buffer_,
488 const local_ordinal_type ibeg_,
489 const local_ordinal_type iend_,
490 const impl_scalar_type_2d_view_tpetra &multivector_,
491 const local_ordinal_type blocksize_,
492 const execution_space &exec_instance_) {
493 const local_ordinal_type num_vectors = multivector_.extent(1);
494 const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
495 const local_ordinal_type idiff = iend_ - ibeg_;
496 const auto abase = buffer_.data() + mv_blocksize * ibeg_;
498 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
499 local_ordinal_type vector_size(0);
502 else if (blocksize_ <= 8)
504 else if (blocksize_ <= 16)
509 const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
510 const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
511 Kokkos::parallel_for(
512 Kokkos::Experimental::require(policy, work_item_property),
513 KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
514 const local_ordinal_type i = member.league_rank();
515 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](
const local_ordinal_type &j) {
516 auto aptr = abase + blocksize_ * (i + idiff * j);
517 auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
518 if (std::is_same<PackTag, ToBuffer>::value)
519 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](
const local_ordinal_type &k) {
523 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](
const local_ordinal_type &k) {
530 void asyncSendRecvVar1(
const impl_scalar_type_2d_view_tpetra &mv) {
531 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
533#ifdef HAVE_IFPACK2_MPI
535 const local_ordinal_type num_vectors = mv.extent(1);
536 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
539 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
540 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
542 reinterpret_cast<char *
>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
543 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize *
sizeof(impl_scalar_type),
549 reinterpret_cast<char *
>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
550 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize *
sizeof(impl_scalar_type),
558 execution_space().fence();
561 for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
563 if (i < 8) exec_instances[i % 8].fence();
564 copy<ToBuffer>(lids.send, buffer.send,
565 offset_host.send(i), offset_host.send(i + 1),
568 exec_instances[i % 8]);
569 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
571 const local_ordinal_type num_vectors = mv.extent(1);
572 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
574 Kokkos::deep_copy(exec_instances[i % 8],
575 Kokkos::subview(buffer_host.send,
576 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
577 offset_host.send(i) * mv_blocksize,
578 offset_host.send(i + 1) * mv_blocksize)),
579 Kokkos::subview(buffer.send,
580 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
581 offset_host.send(i) * mv_blocksize,
582 offset_host.send(i + 1) * mv_blocksize)));
587 for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
589 if (i < 8) exec_instances[i % 8].fence();
590 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
592 reinterpret_cast<const char *
>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
593 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize *
sizeof(impl_scalar_type),
599 reinterpret_cast<const char *
>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
600 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize *
sizeof(impl_scalar_type),
608 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
611 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
614 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
617 void syncRecvVar1() {
618 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
619#ifdef HAVE_IFPACK2_MPI
621 for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.recv.extent(0)); ++i) {
622 local_ordinal_type idx = i;
625 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
627 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
628 const local_ordinal_type num_vectors = remote_multivector.extent(1);
629 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
632 Kokkos::subview(buffer.recv,
633 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
634 offset_host.recv(idx) * mv_blocksize,
635 offset_host.recv(idx + 1) * mv_blocksize)),
636 Kokkos::subview(buffer_host.recv,
637 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
638 offset_host.recv(idx) * mv_blocksize,
639 offset_host.recv(idx + 1) * mv_blocksize)));
643 copy<ToMultiVector>(lids.recv, buffer.recv,
644 offset_host.recv(idx), offset_host.recv(idx + 1),
645 remote_multivector, blocksize,
646 exec_instances[idx % 8]);
653 waitall(reqs.send.size(), reqs.send.data());
655 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
664 template <
typename PackTag>
665 static void copy(
const local_ordinal_type_1d_view &lids_,
666 const impl_scalar_type_1d_view &buffer_,
667 const local_ordinal_type &ibeg_,
668 const local_ordinal_type &iend_,
669 const impl_scalar_type_2d_view_tpetra &multivector_,
670 const local_ordinal_type blocksize_) {
671 const local_ordinal_type num_vectors = multivector_.extent(1);
672 const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
673 const local_ordinal_type idiff = iend_ - ibeg_;
674 const auto abase = buffer_.data() + mv_blocksize * ibeg_;
675 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
676 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
677 local_ordinal_type vector_size(0);
680 else if (blocksize_ <= 8)
682 else if (blocksize_ <= 16)
686 const team_policy_type policy(idiff, 1, vector_size);
687 Kokkos::parallel_for(
688 "AsyncableImport::TeamPolicy::copy",
689 policy, KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
690 const local_ordinal_type i = member.league_rank();
691 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](
const local_ordinal_type &j) {
692 auto aptr = abase + blocksize_ * (i + idiff * j);
693 auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
694 if (std::is_same<PackTag, ToBuffer>::value)
695 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](
const local_ordinal_type &k) {
699 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](
const local_ordinal_type &k) {
705 const Kokkos::RangePolicy<execution_space> policy(0, idiff * num_vectors);
706 Kokkos::parallel_for(
707 "AsyncableImport::RangePolicy::copy",
708 policy, KOKKOS_LAMBDA(
const local_ordinal_type &ij) {
709 const local_ordinal_type i = ij % idiff;
710 const local_ordinal_type j = ij / idiff;
711 auto aptr = abase + blocksize_ * (i + idiff * j);
712 auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
713 auto from = std::is_same<PackTag, ToBuffer>::value ? bptr : aptr;
714 auto to = std::is_same<PackTag, ToBuffer>::value ? aptr : bptr;
715 memcpy(to, from,
sizeof(impl_scalar_type) * blocksize_);
723 void asyncSendRecvVar0(
const impl_scalar_type_2d_view_tpetra &mv) {
724 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
726#ifdef HAVE_IFPACK2_MPI
728 const local_ordinal_type num_vectors = mv.extent(1);
729 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
732 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
733 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
735 reinterpret_cast<char *
>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
736 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize *
sizeof(impl_scalar_type),
742 reinterpret_cast<char *
>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
743 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize *
sizeof(impl_scalar_type),
751 for (local_ordinal_type i = 0, iend = pids.send.extent(0); i < iend; ++i) {
752 copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i + 1),
755 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
757 reinterpret_cast<const char *
>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
758 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize *
sizeof(impl_scalar_type),
764 Kokkos::subview(buffer_host.send,
765 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
766 offset_host.send(i) * mv_blocksize,
767 offset_host.send(i + 1) * mv_blocksize)),
768 Kokkos::subview(buffer.send,
769 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
770 offset_host.send(i) * mv_blocksize,
771 offset_host.send(i + 1) * mv_blocksize)));
773 reinterpret_cast<const char *
>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
774 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize *
sizeof(impl_scalar_type),
783 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
786 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
789 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
792 void syncRecvVar0() {
793 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
794#ifdef HAVE_IFPACK2_MPI
796 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
797 local_ordinal_type idx = i;
798 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
799 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
800 const local_ordinal_type num_vectors = remote_multivector.extent(1);
801 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
803 Kokkos::subview(buffer.recv,
804 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
805 offset_host.recv(idx) * mv_blocksize,
806 offset_host.recv(idx + 1) * mv_blocksize)),
807 Kokkos::subview(buffer_host.recv,
808 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
809 offset_host.recv(idx) * mv_blocksize,
810 offset_host.recv(idx + 1) * mv_blocksize)));
812 copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx + 1),
813 remote_multivector, blocksize);
816 waitall(reqs.send.size(), reqs.send.data());
818 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
824 void asyncSendRecv(
const impl_scalar_type_2d_view_tpetra &mv) {
825#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
826#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
827 asyncSendRecvVar1(mv);
829 asyncSendRecvVar0(mv);
832 asyncSendRecvVar0(mv);
836#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
837#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
847 void syncExchange(
const impl_scalar_type_2d_view_tpetra &mv) {
848 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
851 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
854 impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView()
const {
return remote_multivector; }
857template <
typename ViewType1,
typename ViewType2>
858struct are_same_struct {
862 are_same_struct(ViewType1 keys1_, ViewType2 keys2_)
865 KOKKOS_INLINE_FUNCTION
866 void operator()(
int i,
unsigned int &count)
const {
867 if (keys1(i) != keys2(i)) count++;
871template <
typename ViewType1,
typename ViewType2>
872bool are_same(ViewType1 keys1, ViewType2 keys2) {
873 unsigned int are_same_ = 0;
875 Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
876 are_same_struct(keys1, keys2),
878 return are_same_ == 0;
884template <
typename MatrixType>
885Teuchos::RCP<AsyncableImport<MatrixType>>
886createBlockCrsAsyncImporter(
const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
887 IFPACK2_BLOCKHELPER_TIMER(
"createBlockCrsAsyncImporter", createBlockCrsAsyncImporter);
888 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
889 using tpetra_map_type =
typename impl_type::tpetra_map_type;
890 using local_ordinal_type =
typename impl_type::local_ordinal_type;
891 using global_ordinal_type =
typename impl_type::global_ordinal_type;
892 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
893 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
894 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
895 using global_indices_array_device_type = Kokkos::View<const global_ordinal_type *, typename tpetra_map_type::device_type>;
897 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
898 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
900 bool hasBlockCrsMatrix = !A_bcrs.is_null();
903 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());
905 const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
906 const auto domain_map = g.getDomainMap();
907 const auto column_map = g.getColMap();
909 std::vector<global_ordinal_type> gids;
911 Kokkos::Subview<global_indices_array_device_type, std::pair<int, int>> column_map_global_iD_last;
913 bool separate_remotes =
true, found_first =
false, need_owned_permutation =
false;
915 IFPACK2_BLOCKHELPER_TIMER(
"createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);
917 global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
918 global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();
920 if (are_same(domain_map_global_iD, column_map_global_iD)) {
922 separate_remotes =
true;
923 need_owned_permutation =
false;
925 column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
926 std::pair<int, int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
929 for (
size_t i = 0; i < column_map->getLocalNumElements(); ++i) {
930 const global_ordinal_type gid = column_map->getGlobalElement(i);
931 if (!domain_map->isNodeGlobalElement(gid)) {
934 }
else if (found_first) {
935 separate_remotes =
false;
938 if (!found_first && !need_owned_permutation &&
939 domain_map->getLocalElement(gid) !=
static_cast<local_ordinal_type
>(i)) {
948 need_owned_permutation =
true;
952 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
955 if (separate_remotes) {
956 IFPACK2_BLOCKHELPER_TIMER(
"createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
957 const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
958 const auto parsimonious_col_map = need_owned_permutation ? Teuchos::rcp(
new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm())) : Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
959 if (parsimonious_col_map->getGlobalNumElements() > 0) {
961 local_ordinal_type_1d_view dm2cm;
962 if (need_owned_permutation) {
963 dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag(
"dm2cm"), domain_map->getLocalNumElements());
964 const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
965 for (
size_t i = 0; i < domain_map->getLocalNumElements(); ++i)
966 dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
967 Kokkos::deep_copy(dm2cm, dm2cm_host);
969 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
970 return Teuchos::rcp(
new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
973 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
974 return Teuchos::null;
977template <
typename local_ordinal_type>
978local_ordinal_type costTRSM(
const local_ordinal_type block_size) {
979 return block_size * block_size;
982template <
typename local_ordinal_type>
983local_ordinal_type costGEMV(
const local_ordinal_type block_size) {
984 return 2 * block_size * block_size;
987template <
typename local_ordinal_type>
988local_ordinal_type costTriDiagSolve(
const local_ordinal_type subline_length,
const local_ordinal_type block_size) {
989 return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length - 1) * costGEMV(block_size);
992template <
typename local_ordinal_type>
993local_ordinal_type costSolveSchur(
const local_ordinal_type num_parts,
994 const local_ordinal_type num_teams,
995 const local_ordinal_type line_length,
996 const local_ordinal_type block_size,
997 const local_ordinal_type n_subparts_per_part) {
998 const local_ordinal_type subline_length = ceil(
double(line_length - (n_subparts_per_part - 1) * 2) / n_subparts_per_part);
999 if (subline_length < 1) {
1003 const local_ordinal_type p_n_lines = ceil(
double(num_parts) / num_teams);
1004 const local_ordinal_type p_n_sublines = ceil(
double(n_subparts_per_part) * num_parts / num_teams);
1005 const local_ordinal_type p_n_sublines_2 = ceil(
double(n_subparts_per_part - 1) * num_parts / num_teams);
1007 const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
1008 const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part - 1) * 2, block_size);
1009 const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length, block_size);
1010 const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);
1012 if (n_subparts_per_part == 1) {
1013 return p_costApplyAinv;
1015 return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
1018template <
typename local_ordinal_type>
1019local_ordinal_type getAutomaticNSubparts(
const local_ordinal_type num_parts,
1020 const local_ordinal_type num_teams,
1021 const local_ordinal_type line_length,
1022 const local_ordinal_type block_size) {
1029 double parallelismSurplus = Kokkos::sqrt((
double)num_teams / num_parts);
1030 double logLineLength = Kokkos::log2((
double)line_length);
1031 (void)logLineLength;
1033#if defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU)
1035 double modeled = -9.2312 + 4.6946 * parallelismSurplus + 0.4095 * block_size + 0.966 * logLineLength;
1037 if (parallelismSurplus < 0.3)
1039#elif defined(KOKKOS_ARCH_HOPPER) || defined(KOKKOS_ARCH_BLACKWELL)
1041 double modeled = -9.6053 + 4.7477 * parallelismSurplus + 0.2338 * block_size + 1.0794 * logLineLength;
1043 double maxSplit = (double)line_length / 8;
1044 if (modeled > maxSplit)
1046#elif defined(KOKKOS_ENABLE_CUDA)
1050 if (parallelismSurplus > 1 && line_length > 64)
1052#elif defined(KOKKOS_ENABLE_HIP)
1054 double modeled = -8.6214 + 7.3468 * parallelismSurplus + 0.3596 * block_size + 0.6673 * logLineLength;
1058 if (parallelismSurplus > 1 && line_length > 64)
1063 local_ordinal_type n_subparts_per_part = 0.5 + modeled;
1065 if (parallelismSurplus < 0.3)
1066 n_subparts_per_part = 1;
1073 local_ordinal_type min_subparts_per_part = 1;
1074 local_ordinal_type max_subparts_per_part = (line_length + 2) / 3;
1076 if (max_subparts_per_part > 16)
1077 max_subparts_per_part = 16;
1078 if (n_subparts_per_part < min_subparts_per_part)
1079 n_subparts_per_part = min_subparts_per_part;
1080 if (n_subparts_per_part > max_subparts_per_part)
1081 n_subparts_per_part = max_subparts_per_part;
1082 return n_subparts_per_part;
1085template <
typename ArgActiveExecutionMemorySpace>
1086struct SolveTridiagsDefaultModeAndAlgo;
1091template <
typename MatrixType>
1092BlockHelperDetails::PartInterface<MatrixType>
1093createPartInterface(
const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1094 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
1095 const Teuchos::Array<Teuchos::Array<
typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type>> &partitions,
1096 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
1097 IFPACK2_BLOCKHELPER_TIMER(
"createPartInterface", createPartInterface);
1098 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1099 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1100 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1101 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
1102 using size_type =
typename impl_type::size_type;
1104 auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);
1106 TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0);
1107 const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
1108 constexpr int vector_length = impl_type::vector_length;
1109 constexpr int internal_vector_length = impl_type::internal_vector_length;
1111 const auto comm = A->getRowMap()->getComm();
1113 BlockHelperDetails::PartInterface<MatrixType> interf;
1115 const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
1116 const bool jacobi = partitions.size() == 0 || partitions.size() == A_n_lclrows;
1117 const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1119 typedef std::pair<local_ordinal_type, local_ordinal_type> size_idx_pair_type;
1120 std::vector<size_idx_pair_type> partsz(nparts);
1123 for (local_ordinal_type i = 0; i < nparts; ++i)
1124 partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1125 std::sort(partsz.begin(), partsz.end(),
1126 [](
const size_idx_pair_type &x,
const size_idx_pair_type &y) {
1127 return x.first > y.first;
1131 local_ordinal_type n_subparts_per_part;
1133 n_subparts_per_part = 1;
1135 if (n_subparts_per_part_in == -1) {
1138 using execution_space =
typename impl_type::execution_space;
1141 if constexpr (impl_type::node_type::is_gpu) {
1142 const int line_length = partsz[0].first;
1144 const local_ordinal_type team_size =
1145 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1146 recommended_team_size(blocksize, vector_length, internal_vector_length);
1148 const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
1149 n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
1150#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1151 printf(
"Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
1154 n_subparts_per_part = 1;
1155#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1156 printf(
"Automatically chosen n_subparts_per_part = 1 for CPU backend\n");
1160 n_subparts_per_part = n_subparts_per_part_in;
1165 const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
1168 const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part - 1);
1170#if defined(BLOCKTRIDICONTAINER_DEBUG)
1171 local_ordinal_type nrows = 0;
1175 for (local_ordinal_type i = 0; i < nparts; ++i) nrows += partitions[i].size();
1177 TEUCHOS_TEST_FOR_EXCEPT_MSG(nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) <<
"The #rows implied by the local partition is not "
1178 <<
"the same as getLocalNumRows: " << nrows <<
" vs " << A_n_lclrows);
1182 std::vector<local_ordinal_type> p;
1184 interf.max_partsz = 1;
1185 interf.max_subpartsz = 0;
1186 interf.n_subparts_per_part = 1;
1187 interf.nparts = nparts;
1192 for (local_ordinal_type i = 0; i < nparts; ++i)
1193 p[i] = partsz[i].second;
1195 interf.max_partsz = partsz[0].first;
1197 constexpr local_ordinal_type connection_length = 2;
1198 const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1199 const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1201 interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
1202 interf.n_subparts_per_part = n_subparts_per_part;
1203 interf.nparts = nparts;
1207 interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag(
"partptr"), nparts + 1);
1208 interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag(
"lclrow"), A_n_lclrows);
1209 interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag(
"part2rowidx0"), nparts + 1);
1210 interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag(
"part2packrowidx0"), nparts + 1);
1211 interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag(
"rowidx2part"), A_n_lclrows);
1213 interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag(
"part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1214 interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag(
"part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1215 interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag(
"rowidx2part"), A_n_lclrows);
1217 interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag(
"partptr_sub"), n_sub_parts_and_schur, 2);
1220 const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1221 const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1223 const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1224 const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1225 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1226 const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1228 const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1229 const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1230 const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1233 interf.row_contiguous =
true;
1235 part2rowidx0(0) = 0;
1236 part2packrowidx0(0) = 0;
1237 local_ordinal_type pack_nrows = 0;
1238 local_ordinal_type pack_nrows_sub = 0;
1240 IFPACK2_BLOCKHELPER_TIMER(
"compute part indices (Jacobi)", Jacobi);
1244 for (local_ordinal_type i = 0; i <= nparts; ++i) {
1245 part2rowidx0(i) = i;
1248 for (local_ordinal_type i = 0; i < nparts; ++i) {
1252 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1254 if (ip % vector_length == 0) pack_nrows = 1;
1255 part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1257 part2rowidx0_sub(0) = 0;
1258 partptr_sub(0, 0) = 0;
1260 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1261 constexpr local_ordinal_type ipnrows = 1;
1262 const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1264 TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
1265 "In the part " << ip);
1267 constexpr local_ordinal_type connection_length = 2;
1269 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1270 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::logic_error,
1271 "The part " << ip <<
" is too short to use " << n_subparts_per_part <<
" sub parts.");
1273 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1274 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1276 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1278 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1279 const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1280 const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1281 if (local_sub_ip != n_subparts_per_part - 1) {
1282 if (local_sub_ip != 0) {
1283 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1284 }
else if (ip != 0) {
1285 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1287 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1288 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1289 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1291 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1292 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1294#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1295 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
1296 printf(
"Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1299 if (local_sub_ip != 0) {
1300 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1301 }
else if (ip != 0) {
1302 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1304 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1306 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1308#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1309 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
1315#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1316 std::cout <<
"partptr_sub = " << std::endl;
1317 for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1318 for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1319 std::cout << partptr_sub(i, j) <<
" ";
1321 std::cout << std::endl;
1323 std::cout <<
"partptr_sub end" << std::endl;
1327 local_ordinal_type npacks = ceil(
float(nparts) / vector_length);
1329 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1330 for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1331 part2packrowidx0_sub(ip, 0) = 0;
1333 for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1335 local_ordinal_type ip_min = ipack * vector_length;
1336 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1337 for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1338 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1342 for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1343 local_ordinal_type ip_min = ipack * vector_length;
1344 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1346 const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1348 constexpr local_ordinal_type connection_length = 2;
1350 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1351 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1353 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1354 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1355 if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1357 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1359 for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1360 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1365 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1367 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1369 IFPACK2_BLOCKHELPER_TIMER(
"compute part indices", indices);
1370 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1371 const auto *part = &partitions[p[ip]];
1372 const local_ordinal_type ipnrows = part->size();
1373 TEUCHOS_ASSERT(ip == 0 || (ipnrows <=
static_cast<local_ordinal_type
>(partitions[p[ip - 1]].size())));
1374 TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1375 BlockHelperDetails::get_msg_prefix(comm)
1376 <<
"partition " << p[ip]
1377 <<
" is empty, which is not allowed.");
1379 part2rowidx0(ip + 1) = part2rowidx0(ip) + ipnrows;
1382 if (ip % vector_length == 0) pack_nrows = ipnrows;
1383 part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1384 const local_ordinal_type offset = partptr(ip);
1385 for (local_ordinal_type i = 0; i < ipnrows; ++i) {
1386 const auto lcl_row = (*part)[i];
1387 TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1388 BlockHelperDetails::get_msg_prefix(comm)
1389 <<
"partitions[" << p[ip] <<
"]["
1390 << i <<
"] = " << lcl_row
1391 <<
" but input matrix implies limits of [0, " << A_n_lclrows - 1
1393 lclrow(offset + i) = lcl_row;
1394 rowidx2part(offset + i) = ip;
1395 if (interf.row_contiguous && offset + i > 0 && lclrow((offset + i) - 1) + 1 != lcl_row)
1396 interf.row_contiguous =
false;
1398 partptr(ip + 1) = offset + ipnrows;
1400#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1401 printf(
"Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1402 printf(
"partptr(%d+1) = %d\n", ip, partptr(ip + 1));
1406 part2rowidx0_sub(0) = 0;
1407 partptr_sub(0, 0) = 0;
1410 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1411 const auto *part = &partitions[p[ip]];
1412 const local_ordinal_type ipnrows = part->size();
1413 const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1415 TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
1416 "In the part " << ip);
1418 constexpr local_ordinal_type connection_length = 2;
1420 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1421 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::logic_error,
1422 "The part " << ip <<
" is too short to use " << n_subparts_per_part <<
" sub parts.");
1424 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1425 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1427 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1429 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1430 const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1431 const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1432 if (local_sub_ip != n_subparts_per_part - 1) {
1433 if (local_sub_ip != 0) {
1434 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1435 }
else if (ip != 0) {
1436 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1438 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1439 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1440 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1442 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1443 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1445#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1446 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1447 printf(
"Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1450 if (local_sub_ip != 0) {
1451 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1452 }
else if (ip != 0) {
1453 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1455 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1457 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1459#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1460 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1467 local_ordinal_type npacks = ceil(
float(nparts) / vector_length);
1469 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1470 for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1471 part2packrowidx0_sub(ip, 0) = 0;
1473 for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1475 local_ordinal_type ip_min = ipack * vector_length;
1476 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1477 for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1478 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1482 for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1483 local_ordinal_type ip_min = ipack * vector_length;
1484 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1486 const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1488 constexpr local_ordinal_type connection_length = 2;
1490 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1491 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1493 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1494 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1495 if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1497 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1499 for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1500 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1505 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1507 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1509#if defined(BLOCKTRIDICONTAINER_DEBUG)
1510 TEUCHOS_ASSERT(partptr(nparts) == nrows);
1512 if (lclrow(0) != 0) interf.row_contiguous =
false;
1514 Kokkos::deep_copy(interf.partptr, partptr);
1515 Kokkos::deep_copy(interf.lclrow, lclrow);
1517 Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1520 interf.part2rowidx0 = interf.partptr;
1521 Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1523 interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1524 Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1527 IFPACK2_BLOCKHELPER_TIMER(
"Fill packptr", packptr0);
1528 local_ordinal_type npacks = ceil(
float(nparts) / vector_length) * (part2packrowidx0_sub.extent(1) - 1);
1530 for (local_ordinal_type ip = 1; ip <= nparts; ++ip)
1531 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1534 interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag(
"packptr"), npacks + 1);
1535 const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1537 for (local_ordinal_type ip = 1, k = 1; ip <= nparts; ++ip)
1538 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1541 Kokkos::deep_copy(interf.packptr, packptr);
1543 local_ordinal_type npacks_per_subpart = ceil(
float(nparts) / vector_length);
1544 npacks = ceil(
float(nparts) / vector_length) * (part2packrowidx0_sub.extent(1) - 1);
1546 interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag(
"packindices_sub"), npacks_per_subpart * n_subparts_per_part);
1547 interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag(
"packindices_schur"), npacks_per_subpart, n_subparts_per_part - 1);
1549 const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1550 const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1553 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part - 1; ++local_sub_ip) {
1554 for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1555 packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1556 packindices_schur(local_pack_ip, local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1560 for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1561 packindices_sub((n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip;
1564#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1565 std::cout <<
"packindices_sub = " << std::endl;
1566 for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1567 std::cout << packindices_sub(i) <<
" ";
1569 std::cout << std::endl;
1570 std::cout <<
"packindices_sub end" << std::endl;
1572 std::cout <<
"packindices_schur = " << std::endl;
1573 for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1574 for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1575 std::cout << packindices_schur(i, j) <<
" ";
1577 std::cout << std::endl;
1580 std::cout <<
"packindices_schur end" << std::endl;
1583 Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1584 Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1586 interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag(
"packptr"), npacks + 1);
1587 const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1589 for (local_ordinal_type k = 0; k < npacks + 1; ++k)
1590 packptr_sub(k) = packptr(k % npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1592 Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1593 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1595 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1603template <
typename MatrixType>
1606 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1608 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1609 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
1610 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
1611 using btdm_scalar_type_3d_view =
typename impl_type::btdm_scalar_type_3d_view;
1612 using internal_vector_type_3d_view =
typename impl_type::internal_vector_type_3d_view;
1618 size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1621 local_ordinal_type_1d_view A_colindsub;
1624 vector_type_3d_view values;
1627 vector_type_3d_view values_schur;
1629 vector_type_4d_view e_values;
1633 internal_vector_type_3d_view X_internal_vector_values_schur;
1638 size_type_1d_view diag_offsets;
1642 btdm_scalar_type_3d_view d_inv;
1644 bool is_diagonal_only;
1650 template <
typename idx_type>
1651 static KOKKOS_FORCEINLINE_FUNCTION
1653 IndexToRow(
const idx_type &ind) {
return (ind + 1) / 3; }
1656 template <
typename idx_type>
1657 static KOKKOS_FORCEINLINE_FUNCTION
1659 RowToIndex(
const idx_type &row) {
return row > 0 ? 3 * row - 1 : 0; }
1661 template <
typename idx_type>
1662 static KOKKOS_FORCEINLINE_FUNCTION
1664 NumBlocks(
const idx_type &nrows) {
return nrows > 0 ? 3 * nrows - 2 : 0; }
1666 template <
typename idx_type>
1667 static KOKKOS_FORCEINLINE_FUNCTION
1669 NumBlocksSchur(
const idx_type &nrows) {
return nrows > 0 ? 3 * nrows + 2 : 0; }
1675template <
typename MatrixType>
1677createBlockTridiags(
const BlockHelperDetails::PartInterface<MatrixType> &interf) {
1678 IFPACK2_BLOCKHELPER_TIMER(
"createBlockTridiags", createBlockTridiags0);
1680 using execution_space =
typename impl_type::execution_space;
1681 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1682 using size_type =
typename impl_type::size_type;
1683 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1685 constexpr int vector_length = impl_type::vector_length;
1689 const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1692 btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag(
"btdm.flat_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1693 const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part);
1694 Kokkos::parallel_scan(
1695 "createBlockTridiags::RangePolicy::flat_td_ptr",
1696 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i, size_type &update,
const bool &
final) {
1697 const local_ordinal_type partidx = i / (2 * interf.n_subparts_per_part);
1698 const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1701 btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1703 if (local_subpartidx != (2 * interf.n_subparts_per_part - 1)) {
1704 const local_ordinal_type nrows = interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 1) - interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 0);
1705 if (local_subpartidx % 2 == 0)
1706 update += btdm.NumBlocks(nrows);
1708 update += btdm.NumBlocksSchur(nrows);
1712 const auto nblocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts - 1, 2 * interf.n_subparts_per_part - 1));
1713 btdm.is_diagonal_only = (
static_cast<local_ordinal_type
>(nblocks()) == ntridiags);
1717 if (vector_length == 1) {
1718 btdm.pack_td_ptr = btdm.flat_td_ptr;
1722 local_ordinal_type npacks_per_subpart = 0;
1723 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1724 Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1725 for (local_ordinal_type ip = 1; ip <= interf.nparts; ++ip)
1726 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1727 ++npacks_per_subpart;
1729 btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag(
"btdm.pack_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1730 const Kokkos::RangePolicy<execution_space> policy(0, npacks_per_subpart);
1732 Kokkos::parallel_for(
1733 "createBlockTridiags::RangePolicy::pack_td_ptr",
1734 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i) {
1735 for (local_ordinal_type j = 0; j < 2 * interf.n_subparts_per_part; ++j) {
1736 const local_ordinal_type pack_id = (j == 2 * interf.n_subparts_per_part - 1) ? i + (j - 1) * npacks_per_subpart : i + j * npacks_per_subpart;
1737 const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id + 1) - interf.packptr_sub(pack_id);
1739 const local_ordinal_type parti = interf.packptr_sub(pack_id);
1740 const local_ordinal_type partidx = parti % interf.nparts;
1742 for (local_ordinal_type pti = 0; pti < nparts_in_pack; ++pti) {
1743 btdm.pack_td_ptr(partidx + pti, j) = btdm.flat_td_ptr(i, j);
1749 btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag(
"btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1751 const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1752 constexpr local_ordinal_type connection_length = 2;
1754 host_pack_td_ptr_schur(0, 0) = 0;
1755 for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1756 if (i % vector_length == 0) {
1758 host_pack_td_ptr_schur(i, 0) = host_pack_td_ptr_schur(i - 1, host_pack_td_ptr_schur.extent(1) - 1);
1759 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part - 1; ++j) {
1760 host_pack_td_ptr_schur(i, j + 1) = host_pack_td_ptr_schur(i, j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part - 2 ? 1 : 0);
1763 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1764 host_pack_td_ptr_schur(i, j) = host_pack_td_ptr_schur(i - 1, j);
1769 Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1771#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1772 const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1773 std::cout <<
"flat_td_ptr = " << std::endl;
1774 for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1775 for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1776 std::cout << host_flat_td_ptr(i, j) <<
" ";
1778 std::cout << std::endl;
1780 std::cout <<
"flat_td_ptr end" << std::endl;
1782 const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1784 std::cout <<
"pack_td_ptr = " << std::endl;
1785 for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1786 for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1787 std::cout << host_pack_td_ptr(i, j) <<
" ";
1789 std::cout << std::endl;
1791 std::cout <<
"pack_td_ptr end" << std::endl;
1793 std::cout <<
"pack_td_ptr_schur = " << std::endl;
1794 for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1795 for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1796 std::cout << host_pack_td_ptr_schur(i, j) <<
" ";
1798 std::cout << std::endl;
1800 std::cout <<
"pack_td_ptr_schur end" << std::endl;
1804 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1819template <
typename MatrixType>
1820void setTridiagsToIdentity(
const BlockTridiags<MatrixType> &btdm,
1821 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view &packptr) {
1822 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1823 using execution_space =
typename impl_type::execution_space;
1824 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1825 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1827 const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1828 const local_ordinal_type blocksize = btdm.values.extent(1);
1831 const int vector_length = impl_type::vector_length;
1832 const int internal_vector_length = impl_type::internal_vector_length;
1834 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
1835 using internal_vector_type =
typename impl_type::internal_vector_type;
1836 using internal_vector_type_4d_view =
1837 typename impl_type::internal_vector_type_4d_view;
1839 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1840 const internal_vector_type_4d_view values(
reinterpret_cast<internal_vector_type *
>(btdm.values.data()),
1841 btdm.values.extent(0),
1842 btdm.values.extent(1),
1843 btdm.values.extent(2),
1844 vector_length / internal_vector_length);
1845 const local_ordinal_type vector_loop_size = values.extent(3);
1846#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1847 local_ordinal_type total_team_size(0);
1849 total_team_size = 32;
1850 else if (blocksize <= 9)
1851 total_team_size = 64;
1852 else if (blocksize <= 12)
1853 total_team_size = 96;
1854 else if (blocksize <= 16)
1855 total_team_size = 128;
1856 else if (blocksize <= 20)
1857 total_team_size = 160;
1859 total_team_size = 160;
1860 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1861 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1862#elif defined(KOKKOS_ENABLE_HIP)
1867 local_ordinal_type total_team_size(0);
1869 total_team_size = 32;
1870 else if (blocksize <= 9)
1871 total_team_size = 64;
1872 else if (blocksize <= 12)
1873 total_team_size = 96;
1874 else if (blocksize <= 16)
1875 total_team_size = 128;
1876 else if (blocksize <= 20)
1877 total_team_size = 160;
1879 total_team_size = 160;
1880 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1881 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1882#elif defined(KOKKOS_ENABLE_SYCL)
1884 local_ordinal_type total_team_size(0);
1886 total_team_size = 32;
1887 else if (blocksize <= 9)
1888 total_team_size = 64;
1889 else if (blocksize <= 12)
1890 total_team_size = 96;
1891 else if (blocksize <= 16)
1892 total_team_size = 128;
1893 else if (blocksize <= 20)
1894 total_team_size = 160;
1896 total_team_size = 160;
1897 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1898 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1901 const team_policy_type policy(packptr.extent(0) - 1, 1, 1);
1903 Kokkos::parallel_for(
1904 "setTridiagsToIdentity::TeamPolicy",
1905 policy, KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
1906 const local_ordinal_type k = member.league_rank();
1907 const local_ordinal_type ibeg = pack_td_ptr(packptr(k), 0);
1908 const local_ordinal_type iend = pack_td_ptr(packptr(k), pack_td_ptr.extent(1) - 1);
1910 const local_ordinal_type diff = iend - ibeg;
1911 const local_ordinal_type icount = diff / 3 + (diff % 3 > 0);
1912 const btdm_scalar_type one(1);
1913 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
1914 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, icount), [&](
const local_ordinal_type &ii) {
1915 const local_ordinal_type i = ibeg + ii * 3;
1916 for (local_ordinal_type j = 0; j < blocksize; ++j) {
1917 values(i, j, j, v) = one;
1928template <
typename MatrixType>
1929void performSymbolicPhase(
const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1930 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1931 const BlockHelperDetails::PartInterface<MatrixType> &interf,
1932 BlockTridiags<MatrixType> &btdm,
1933 BlockHelperDetails::AmD<MatrixType> &amd,
1934 const bool overlap_communication_and_computation,
1935 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
1937 bool use_fused_jacobi) {
1938 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::SymbolicPhase", SymbolicPhase);
1940 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1942 using execution_space =
typename impl_type::execution_space;
1944 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1945 using global_ordinal_type =
typename impl_type::global_ordinal_type;
1946 using size_type =
typename impl_type::size_type;
1947 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1948 using size_type_1d_view =
typename impl_type::size_type_1d_view;
1949 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
1950 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
1951 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
1952 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
1953 using btdm_scalar_type_3d_view =
typename impl_type::btdm_scalar_type_3d_view;
1954 using internal_vector_type_3d_view =
typename impl_type::internal_vector_type_3d_view;
1955 using lo_traits = Tpetra::Details::OrdinalTraits<local_ordinal_type>;
1957 constexpr int vector_length = impl_type::vector_length;
1958 constexpr int internal_vector_length = impl_type::internal_vector_length;
1960 const auto comm = A->getRowMap()->getComm();
1962 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
1963 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
1965 bool hasBlockCrsMatrix = !A_bcrs.is_null();
1966 TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
1967 const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows() / g->getLocalNumRows();
1969 const auto partptr = interf.partptr;
1970 const auto lclrow = interf.lclrow;
1971 const auto rowidx2part = interf.rowidx2part;
1972 const auto part2rowidx0 = interf.part2rowidx0;
1973 const auto packptr = interf.packptr;
1976 const local_ordinal_type nrows = Kokkos::create_mirror_view_and_copy(
1977 Kokkos::HostSpace(), Kokkos::subview(partptr, partptr.extent(0) - 1))();
1979 Kokkos::View<local_ordinal_type *, execution_space> col2row(
"col2row", A->getLocalNumCols());
1983 Kokkos::deep_copy(execution_space(), col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1985 TEUCHOS_ASSERT(!(g->getRowMap().is_null() || g->getColMap().is_null() || g->getDomainMap().is_null()));
1986#if defined(BLOCKTRIDICONTAINER_DEBUG)
1989 auto rowmapHost = g->getRowMap();
1990 auto colmapHost = g->getColMap();
1991 auto dommapHost = g->getDomainMap();
1992 for (local_ordinal_type lr = 0; lr < nrows; lr++) {
1993 const global_ordinal_type gid = rowmapHost->getGlobalElement(lr);
1994 TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
1995 if (dommapHost->isNodeGlobalElement(gid)) {
1996 const local_ordinal_type lc = colmapHost->getLocalElement(gid);
1997 TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
1998 BlockHelperDetails::get_msg_prefix(comm) <<
"GID " << gid
1999 <<
" gives an invalid local column.");
2004 auto rowmap = g->getRowMap()->getLocalMap();
2005 auto colmap = g->getColMap()->getLocalMap();
2006 auto dommap = g->getDomainMap()->getLocalMap();
2008 const Kokkos::RangePolicy<execution_space> policy(0, nrows);
2009 Kokkos::parallel_for(
2010 "performSymbolicPhase::RangePolicy::col2row",
2011 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr) {
2012 const global_ordinal_type gid = rowmap.getGlobalElement(lr);
2013 if (dommap.getLocalElement(gid) != lo_traits::invalid()) {
2014 const local_ordinal_type lc = colmap.getLocalElement(gid);
2022 const auto local_graph = g->getLocalGraphDevice();
2023 const auto local_graph_rowptr = local_graph.row_map;
2024 TEUCHOS_ASSERT(local_graph_rowptr.size() ==
static_cast<size_t>(nrows + 1));
2025 const auto local_graph_colidx = local_graph.entries;
2029 Kokkos::View<local_ordinal_type *, execution_space> lclrow2idx(
"lclrow2idx", nrows);
2031 const Kokkos::RangePolicy<execution_space> policy(0, nrows);
2032 Kokkos::parallel_for(
2033 "performSymbolicPhase::RangePolicy::lclrow2idx",
2034 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i) {
2035 lclrow2idx(lclrow(i)) = i;
2040 size_type D_nnz, R_nnz_owned, R_nnz_remote;
2042 const Kokkos::RangePolicy<execution_space> policy(0, nrows);
2043 Kokkos::parallel_reduce
2046 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr, size_type &update_D_nnz, size_type &update_R_nnz_owned, size_type &update_R_nnz_remote) {
2048 const local_ordinal_type ri0 = lclrow2idx(lr);
2049 const local_ordinal_type pi0 = rowidx2part(ri0);
2050 for (size_type j = local_graph_rowptr(lr); j < local_graph_rowptr(lr + 1); ++j) {
2051 const local_ordinal_type lc = local_graph_colidx(j);
2052 const local_ordinal_type lc2r = col2row(lc);
2053 bool incr_R =
false;
2055 if (lc2r == (local_ordinal_type)-1) {
2059 const local_ordinal_type ri = lclrow2idx(lc2r);
2060 const local_ordinal_type pi = rowidx2part(ri);
2068 if (ri0 + 1 >= ri && ri0 <= ri + 1)
2075 ++update_R_nnz_owned;
2077 ++update_R_nnz_remote;
2081 D_nnz, R_nnz_owned, R_nnz_remote);
2084 if (!overlap_communication_and_computation) {
2085 R_nnz_owned += R_nnz_remote;
2091 const auto flat_td_ptr = btdm.flat_td_ptr;
2093 btdm.A_colindsub = local_ordinal_type_1d_view(
"btdm.A_colindsub", D_nnz);
2094 const auto D_A_colindsub = btdm.A_colindsub;
2096#if defined(BLOCKTRIDICONTAINER_DEBUG)
2097 Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2100 const local_ordinal_type nparts = partptr.extent(0) - 1;
2103 const Kokkos::RangePolicy<execution_space> policy(0, nparts);
2104 Kokkos::parallel_for(
2105 "performSymbolicPhase::RangePolicy<execution_space>::D_graph",
2106 policy, KOKKOS_LAMBDA(
const local_ordinal_type &pi0) {
2107 const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2108 local_ordinal_type offset = 0;
2109 for (local_ordinal_type ri0 = partptr(pi0); ri0 < partptr(pi0 + 1); ++ri0) {
2110 const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2112 const local_ordinal_type lr0 = lclrow(ri0);
2113 const size_type j0 = local_graph_rowptr(lr0);
2114 for (size_type j = j0; j < local_graph_rowptr(lr0 + 1); ++j) {
2115 const local_ordinal_type lc = local_graph_colidx(j);
2116 const local_ordinal_type lc2r = col2row[lc];
2117 if (lc2r == (local_ordinal_type)-1)
continue;
2118 const local_ordinal_type ri = lclrow2idx[lc2r];
2119 const local_ordinal_type pi = rowidx2part(ri);
2120 if (pi != pi0)
continue;
2121 if (ri + 1 < ri0 || ri > ri0 + 1)
continue;
2122 const local_ordinal_type row_entry = j - j0;
2123 D_A_colindsub(flat_td_ptr(pi0, 0) + ((td_row_os + ri) - ri0)) = row_entry;
2128#if defined(BLOCKTRIDICONTAINER_DEBUG)
2130 auto D_A_colindsub_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), D_A_colindsub);
2131 for (
size_t i = 0; i < D_A_colindsub_host.extent(0); ++i)
2132 TEUCHOS_ASSERT(D_A_colindsub_host(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2138 const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0) - 1, btdm.pack_td_ptr.extent(1) - 1);
2139 const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2140 btdm.values = vector_type_3d_view(
"btdm.values", num_packed_blocks(), blocksize, blocksize);
2142 if (interf.n_subparts_per_part > 1) {
2143 const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0) - 1, btdm.pack_td_ptr_schur.extent(1) - 1);
2144 const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2145 btdm.values_schur = vector_type_3d_view(
"btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2148 if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2154 amd.rowptr = size_type_1d_view(
"amd.rowptr", nrows + 1);
2155 amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag(
"amd.A_colindsub"), R_nnz_owned);
2157 const auto R_rowptr = amd.rowptr;
2158 const auto R_A_colindsub = amd.A_colindsub;
2160 amd.rowptr_remote = size_type_1d_view(
"amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2161 amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag(
"amd.A_colindsub_remote"), R_nnz_remote);
2163 const auto R_rowptr_remote = amd.rowptr_remote;
2164 const auto R_A_colindsub_remote = amd.A_colindsub_remote;
2167 const Kokkos::RangePolicy<execution_space> policy(0, nrows);
2168 Kokkos::parallel_for(
2169 "performSymbolicPhase::RangePolicy<execution_space>::R_graph_count",
2170 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr) {
2171 const local_ordinal_type ri0 = lclrow2idx[lr];
2172 const local_ordinal_type pi0 = rowidx2part(ri0);
2173 const size_type j0 = local_graph_rowptr(lr);
2174 for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2175 const local_ordinal_type lc = local_graph_colidx(j);
2176 const local_ordinal_type lc2r = col2row[lc];
2177 if (lc2r != (local_ordinal_type)-1) {
2178 const local_ordinal_type ri = lclrow2idx[lc2r];
2179 const local_ordinal_type pi = rowidx2part(ri);
2180 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2185 if (!overlap_communication_and_computation || lc < nrows) {
2188 ++R_rowptr_remote(lr);
2197 size_type R_rowptr_final;
2198 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<execution_space>(nrows + 1, R_rowptr, R_rowptr_final);
2199 TEUCHOS_ASSERT(R_rowptr_final == R_nnz_owned);
2200 if (overlap_communication_and_computation) {
2201 size_type R_rowptr_remote_final;
2202 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<execution_space>(nrows + 1, R_rowptr_remote, R_rowptr_remote_final);
2203 TEUCHOS_ASSERT(R_rowptr_remote_final == R_nnz_remote);
2208 Kokkos::RangePolicy<execution_space> policy(0, nrows);
2209 Kokkos::parallel_for(
2210 "performSymbolicPhase::RangePolicy<execution_space>::R_graph_fill",
2211 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr) {
2212 const local_ordinal_type ri0 = lclrow2idx[lr];
2213 const local_ordinal_type pi0 = rowidx2part(ri0);
2215 size_type cnt_rowptr = R_rowptr(lr);
2216 size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0;
2218 const size_type j0 = local_graph_rowptr(lr);
2219 for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2220 const local_ordinal_type lc = local_graph_colidx(j);
2221 const local_ordinal_type lc2r = col2row[lc];
2222 if (lc2r != (local_ordinal_type)-1) {
2223 const local_ordinal_type ri = lclrow2idx[lc2r];
2224 const local_ordinal_type pi = rowidx2part(ri);
2225 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2228 const local_ordinal_type row_entry = j - j0;
2229 if (!overlap_communication_and_computation || lc < nrows)
2230 R_A_colindsub(cnt_rowptr++) = row_entry;
2232 R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2238 if (hasBlockCrsMatrix)
2239 amd.tpetra_values = (
const_cast<block_crs_matrix_type *
>(A_bcrs.get())->getValuesDeviceNonConst());
2241 amd.tpetra_values = (
const_cast<crs_matrix_type *
>(A_crs.get()))->getLocalValuesDevice(Tpetra::Access::ReadWrite);
2245 if (interf.n_subparts_per_part > 1) {
2247 btdm.e_values = vector_type_4d_view(
"btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2248 btdm.X_internal_vector_values_schur = internal_vector_type_3d_view(
2249 do_not_initialize_tag(
"X_internal_vector_values_schur"),
2250 2 * (interf.n_subparts_per_part - 1) * interf.part2packrowidx0_sub.extent(0),
2252 vector_length / internal_vector_length);
2263 if (BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix) {
2264 bool is_async_importer_active = !async_importer.is_null();
2265 local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2266 bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2267 BlockHelperDetails::ComputeResidualVector<MatrixType>::precompute_A_x_offsets(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2271 if (use_fused_jacobi) {
2272 btdm.d_inv = btdm_scalar_type_3d_view(do_not_initialize_tag(
"btdm.d_inv"), interf.nparts, blocksize, blocksize);
2273 auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice();
2274 auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice();
2275 btdm.diag_offsets = BlockHelperDetails::findDiagOffsets<execution_space, size_type_1d_view>(rowptrs, entries, interf.nparts, blocksize);
2277 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2283template <
typename ArgActiveExecutionMemorySpace>
2288 typedef KB::Mode::Serial mode_type;
2289#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2290 typedef KB::Algo::Level3::CompactMKL algo_type;
2292 typedef KB::Algo::Level3::Blocked algo_type;
2294 static int recommended_team_size(
const int ,
2301#if defined(KOKKOS_ENABLE_CUDA)
2302static inline int ExtractAndFactorizeRecommendedCudaTeamSize(
const int blksize,
2303 const int vector_length,
2304 const int internal_vector_length) {
2305 const int vector_size = vector_length / internal_vector_length;
2306 int total_team_size(0);
2308 total_team_size = 32;
2309 else if (blksize <= 9)
2310 total_team_size = 32;
2311 else if (blksize <= 12)
2312 total_team_size = 96;
2313 else if (blksize <= 16)
2314 total_team_size = 128;
2315 else if (blksize <= 20)
2316 total_team_size = 160;
2318 total_team_size = 160;
2319 return 2 * total_team_size / vector_size;
2322struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2323 typedef KB::Mode::Team mode_type;
2324 typedef KB::Algo::Level3::Unblocked algo_type;
2325 static int recommended_team_size(
const int blksize,
2326 const int vector_length,
2327 const int internal_vector_length) {
2328 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2332struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2333 typedef KB::Mode::Team mode_type;
2334 typedef KB::Algo::Level3::Unblocked algo_type;
2335 static int recommended_team_size(
const int blksize,
2336 const int vector_length,
2337 const int internal_vector_length) {
2338 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2343#if defined(KOKKOS_ENABLE_HIP)
2344static inline int ExtractAndFactorizeRecommendedHIPTeamSize(
const int blksize,
2345 const int vector_length,
2346 const int internal_vector_length) {
2347 const int vector_size = vector_length / internal_vector_length;
2348 int total_team_size(0);
2350 total_team_size = 32;
2351 else if (blksize <= 9)
2352 total_team_size = 32;
2353 else if (blksize <= 12)
2354 total_team_size = 96;
2355 else if (blksize <= 16)
2356 total_team_size = 128;
2357 else if (blksize <= 20)
2358 total_team_size = 160;
2360 total_team_size = 160;
2361 return 2 * total_team_size / vector_size;
2364struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2365 typedef KB::Mode::Team mode_type;
2366 typedef KB::Algo::Level3::Unblocked algo_type;
2367 static int recommended_team_size(
const int blksize,
2368 const int vector_length,
2369 const int internal_vector_length) {
2370 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2374struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2375 typedef KB::Mode::Team mode_type;
2376 typedef KB::Algo::Level3::Unblocked algo_type;
2377 static int recommended_team_size(
const int blksize,
2378 const int vector_length,
2379 const int internal_vector_length) {
2380 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2385#if defined(KOKKOS_ENABLE_SYCL)
2386static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(
const int blksize,
2387 const int vector_length,
2388 const int internal_vector_length) {
2389 const int vector_size = vector_length / internal_vector_length;
2390 int total_team_size(0);
2392 total_team_size = 32;
2393 else if (blksize <= 9)
2394 total_team_size = 32;
2395 else if (blksize <= 12)
2396 total_team_size = 96;
2397 else if (blksize <= 16)
2398 total_team_size = 128;
2399 else if (blksize <= 20)
2400 total_team_size = 160;
2402 total_team_size = 160;
2403 return 2 * total_team_size / vector_size;
2406struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2407 typedef KB::Mode::Team mode_type;
2408 typedef KB::Algo::Level3::Unblocked algo_type;
2409 static int recommended_team_size(
const int blksize,
2410 const int vector_length,
2411 const int internal_vector_length) {
2412 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2416struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2417 typedef KB::Mode::Team mode_type;
2418 typedef KB::Algo::Level3::Unblocked algo_type;
2419 static int recommended_team_size(
const int blksize,
2420 const int vector_length,
2421 const int internal_vector_length) {
2422 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2427template <
typename impl_type,
typename WWViewType>
2428KOKKOS_INLINE_FUNCTION
void
2429solveMultiVector(
const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2430 const typename impl_type::local_ordinal_type & ,
2431 const typename impl_type::local_ordinal_type &i0,
2432 const typename impl_type::local_ordinal_type &r0,
2433 const typename impl_type::local_ordinal_type &nrows,
2434 const typename impl_type::local_ordinal_type &v,
2435 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2436 const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2437 const WWViewType &WW,
2438 const bool skip_first_pass =
false) {
2439 using execution_space =
typename impl_type::execution_space;
2440 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2441 using member_type =
typename team_policy_type::member_type;
2442 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2444 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2446 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2447 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2449 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2452 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
2453 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
2456 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2457 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2460 local_ordinal_type i = i0, r = r0;
2464 if (skip_first_pass) {
2465 i += (nrows - 2) * 3;
2467 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2468 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2469 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2470 KB::Trsm<member_type,
2471 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2472 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2473 X1.assign_data(X2.data());
2476 KB::Trsm<member_type,
2477 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2478 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2479 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
2480 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2481 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2482 member.team_barrier();
2483 KB::Gemm<member_type,
2484 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2485 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2486 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2487 KB::Trsm<member_type,
2488 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2489 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2490 X1.assign_data(X2.data());
2495 KB::Trsm<member_type,
2496 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2497 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2498 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2500 A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
2501 X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
2502 member.team_barrier();
2503 KB::Gemm<member_type,
2504 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2505 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2507 A.assign_data(&D_internal_vector_values(i, 0, 0, v));
2508 KB::Trsm<member_type,
2509 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2510 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2511 X1.assign_data(X2.data());
2515 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2516 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
2517 member.team_barrier();
2518 KB::Gemm<member_type,
2519 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2520 default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
2524template <
typename impl_type,
typename WWViewType,
typename XViewType>
2525KOKKOS_INLINE_FUNCTION
void
2526solveSingleVectorNew(
const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2527 const typename impl_type::local_ordinal_type &blocksize,
2528 const typename impl_type::local_ordinal_type &i0,
2529 const typename impl_type::local_ordinal_type &r0,
2530 const typename impl_type::local_ordinal_type &nrows,
2531 const typename impl_type::local_ordinal_type &v,
2532 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2533 const XViewType &X_internal_vector_values,
2534 const WWViewType &WW) {
2535 using execution_space =
typename impl_type::execution_space;
2538 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2540 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2542 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2543 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2545 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2548 auto A = D_internal_vector_values.data();
2549 auto X = X_internal_vector_values.data();
2552 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
2553 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
2557 const local_ordinal_type astep = D_internal_vector_values.stride(0);
2558 const local_ordinal_type as0 = D_internal_vector_values.stride(1);
2559 const local_ordinal_type as1 = D_internal_vector_values.stride(2);
2560 const local_ordinal_type xstep = X_internal_vector_values.stride(0);
2561 const local_ordinal_type xs0 = X_internal_vector_values.stride(1);
2564 A += i0 * astep + v;
2565 X += r0 * xstep + v;
2570 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2573 blocksize, blocksize,
2578 for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
2579 member.team_barrier();
2580 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2582 blocksize, blocksize,
2584 A + 2 * astep, as0, as1,
2587 X + 1 * xstep, xs0);
2588 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2591 blocksize, blocksize,
2593 A + 3 * astep, as0, as1,
2594 X + 1 * xstep, xs0);
2601 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2604 blocksize, blocksize,
2609 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2611 member.team_barrier();
2612 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2614 blocksize, blocksize,
2616 A + 1 * astep, as0, as1,
2619 X - 1 * xstep, xs0);
2620 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2623 blocksize, blocksize,
2626 X - 1 * xstep, xs0);
2632 const local_ordinal_type ws0 = WW.stride(0);
2633 auto W = WW.data() + v;
2634 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
2635 member, blocksize, X, xs0, W, ws0);
2636 member.team_barrier();
2637 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2639 blocksize, blocksize,
2648template <
typename local_ordinal_type,
typename ViewType>
2649void writeBTDValuesToFile(
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2650#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2651 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2652 std::ofstream myfile;
2653 myfile.open(fileName);
2655 const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2656 local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2657 const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2658 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2660 const local_ordinal_type block_size = scalar_values.extent(1);
2662 const local_ordinal_type n_rows_per_part = (n_blocks_per_part + 2) / 3 * block_size;
2663 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2665 const local_ordinal_type n_packs = ceil(
float(n_parts) / n_parts_per_pack);
2667 myfile <<
"%%MatrixMarket matrix coordinate real general" << std::endl;
2668 myfile <<
"%%nnz = " << nnz;
2669 myfile <<
" block size = " << block_size;
2670 myfile <<
" number of blocks = " << n_blocks;
2671 myfile <<
" number of parts = " << n_parts;
2672 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2673 myfile <<
" number of rows = " << n_rows;
2674 myfile <<
" number of cols = " << n_rows;
2675 myfile <<
" number of packs = " << n_packs << std::endl;
2677 myfile << n_rows <<
" " << n_rows <<
" " << nnz << std::setprecision(9) << std::endl;
2679 local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2680 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2681 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2682 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2683 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2684 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2685 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2687 if (i_block_in_part % 3 == 0) {
2688 current_row_offset = i_block_in_part / 3 * block_size;
2689 current_col_offset = i_block_in_part / 3 * block_size;
2690 }
else if (i_block_in_part % 3 == 1) {
2691 current_row_offset = (i_block_in_part - 1) / 3 * block_size;
2692 current_col_offset = ((i_block_in_part - 1) / 3 + 1) * block_size;
2693 }
else if (i_block_in_part % 3 == 2) {
2694 current_row_offset = ((i_block_in_part - 2) / 3 + 1) * block_size;
2695 current_col_offset = (i_block_in_part - 2) / 3 * block_size;
2697 current_row_offset += current_part_idx * n_rows_per_part;
2698 current_col_offset += current_part_idx * n_rows_per_part;
2699 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2700 for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2701 current_row = current_row_offset + i_in_block + 1;
2702 current_col = current_col_offset + j_in_block + 1;
2703 myfile << current_row <<
" " << current_col <<
" " << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2714template <
typename local_ordinal_type,
typename ViewType>
2715void write4DMultiVectorValuesToFile(
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2716#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2717 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2718 std::ofstream myfile;
2719 myfile.open(fileName);
2721 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2722 const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2723 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2725 const local_ordinal_type block_size = scalar_values.extent(1);
2726 const local_ordinal_type n_cols = scalar_values.extent(2);
2728 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2729 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2731 const local_ordinal_type n_packs = ceil(
float(n_parts) / n_parts_per_pack);
2733 myfile <<
"%%MatrixMarket matrix array real general" << std::endl;
2734 myfile <<
"%%block size = " << block_size;
2735 myfile <<
" number of blocks = " << n_blocks;
2736 myfile <<
" number of parts = " << n_parts;
2737 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2738 myfile <<
" number of rows = " << n_rows;
2739 myfile <<
" number of cols = " << n_cols;
2740 myfile <<
" number of packs = " << n_packs << std::endl;
2742 myfile << n_rows <<
" " << n_cols << std::setprecision(9) << std::endl;
2744 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2745 (void)current_row_offset;
2746 (void)current_part_idx;
2747 for (local_ordinal_type j_in_block = 0; j_in_block < n_cols; ++j_in_block) {
2748 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2749 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2750 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2751 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2752 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2754 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2756 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2757 myfile << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2767template <
typename local_ordinal_type,
typename ViewType>
2768void write5DMultiVectorValuesToFile(
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2769#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2770 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2771 std::ofstream myfile;
2772 myfile.open(fileName);
2774 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2775 const local_ordinal_type n_blocks = scalar_values.extent(1) * n_parts_per_pack;
2776 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2778 const local_ordinal_type block_size = scalar_values.extent(2);
2779 const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2780 const local_ordinal_type n_cols = n_blocks_cols * block_size;
2782 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2783 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2785 const local_ordinal_type n_packs = ceil(
float(n_parts) / n_parts_per_pack);
2787 myfile <<
"%%MatrixMarket matrix array real general" << std::endl;
2788 myfile <<
"%%block size = " << block_size;
2789 myfile <<
" number of blocks = " << n_blocks;
2790 myfile <<
" number of parts = " << n_parts;
2791 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2792 myfile <<
" number of rows = " << n_rows;
2793 myfile <<
" number of cols = " << n_cols;
2794 myfile <<
" number of packs = " << n_packs << std::endl;
2796 myfile << n_rows <<
" " << n_cols << std::setprecision(9) << std::endl;
2798 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2799 (void)current_row_offset;
2800 (void)current_part_idx;
2801 for (local_ordinal_type i_block_col = 0; i_block_col < n_blocks_cols; ++i_block_col) {
2802 for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2803 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2804 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2805 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2806 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2807 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2809 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(1))
2811 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2812 myfile << scalar_values(i_block_col, current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2823template <
typename local_ordinal_type,
typename member_type,
typename ViewType1,
typename ViewType2>
2824KOKKOS_INLINE_FUNCTION
void
2825copy3DView(
const member_type &member,
const ViewType1 &view1,
const ViewType2 &view2) {
2838 Kokkos::Experimental::local_deep_copy(member, view1, view2);
2840template <
typename MatrixType,
int ScratchLevel>
2841struct ExtractAndFactorizeTridiags {
2843 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2845 using execution_space =
typename impl_type::execution_space;
2846 using memory_space =
typename impl_type::memory_space;
2848 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2851 using magnitude_type =
typename impl_type::magnitude_type;
2853 using row_matrix_type =
typename impl_type::tpetra_row_matrix_type;
2854 using crs_graph_type =
typename impl_type::tpetra_crs_graph_type;
2856 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
2857 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
2859 using size_type_2d_view =
typename impl_type::size_type_2d_view;
2860 using impl_scalar_type_1d_view_tpetra =
typename impl_type::impl_scalar_type_1d_view_tpetra;
2862 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
2863 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2864 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
2865 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
2866 using internal_vector_type_4d_view =
typename impl_type::internal_vector_type_4d_view;
2867 using internal_vector_type_5d_view =
typename impl_type::internal_vector_type_5d_view;
2868 using btdm_scalar_type_2d_view =
typename impl_type::btdm_scalar_type_2d_view;
2869 using btdm_scalar_type_3d_view =
typename impl_type::btdm_scalar_type_3d_view;
2870 using btdm_scalar_type_4d_view =
typename impl_type::btdm_scalar_type_4d_view;
2871 using btdm_scalar_type_5d_view =
typename impl_type::btdm_scalar_type_5d_view;
2872 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2873 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2874 using tpetra_block_access_view_type =
typename impl_type::tpetra_block_access_view_type;
2875 using local_crs_graph_type =
typename impl_type::local_crs_graph_type;
2876 using colinds_view =
typename local_crs_graph_type::entries_type;
2878 using internal_vector_type =
typename impl_type::internal_vector_type;
2879 static constexpr int vector_length = impl_type::vector_length;
2880 static constexpr int internal_vector_length = impl_type::internal_vector_length;
2881 static_assert(vector_length >= internal_vector_length,
"Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
2882 static_assert(vector_length % internal_vector_length == 0,
"Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2887 static constexpr int half_vector_length = impl_type::half_vector_length;
2890 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2891 using member_type =
typename team_policy_type::member_type;
2895 const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2896 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2897 const local_ordinal_type max_partsz;
2899 using size_type_1d_view_tpetra = Kokkos::View<size_t *, typename impl_type::node_device_type>;
2900 ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2901 ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2902 ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2904 const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2905 const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2906 const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2907 const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2908 const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2909 const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2910 const Unmanaged<btdm_scalar_type_3d_view> d_inv;
2911 const Unmanaged<size_type_1d_view> diag_offsets;
2913 const local_ordinal_type blocksize, blocksize_square;
2915 const magnitude_type tiny;
2916 const local_ordinal_type vector_loop_size;
2918 bool hasBlockCrsMatrix;
2921 ExtractAndFactorizeTridiags(
const BlockTridiags<MatrixType> &btdm_,
2922 const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2923 const Teuchos::RCP<const row_matrix_type> &A_,
2924 const Teuchos::RCP<const crs_graph_type> &G_,
2925 const magnitude_type &tiny_)
2927 partptr(interf_.partptr)
2928 , lclrow(interf_.lclrow)
2929 , packptr(interf_.packptr)
2930 , packindices_sub(interf_.packindices_sub)
2931 , packptr_sub(interf_.packptr_sub)
2932 , partptr_sub(interf_.partptr_sub)
2933 , part2packrowidx0_sub(interf_.part2packrowidx0_sub)
2934 , packindices_schur(interf_.packindices_schur)
2935 , max_partsz(interf_.max_partsz)
2938 pack_td_ptr(btdm_.pack_td_ptr)
2939 , flat_td_ptr(btdm_.flat_td_ptr)
2940 , pack_td_ptr_schur(btdm_.pack_td_ptr_schur)
2941 , A_colindsub(btdm_.A_colindsub)
2942 , internal_vector_values((internal_vector_type *)btdm_.values.data(),
2943 btdm_.values.extent(0),
2944 btdm_.values.extent(1),
2945 btdm_.values.extent(2),
2946 vector_length / internal_vector_length)
2947 , internal_vector_values_schur((internal_vector_type *)btdm_.values_schur.data(),
2948 btdm_.values_schur.extent(0),
2949 btdm_.values_schur.extent(1),
2950 btdm_.values_schur.extent(2),
2951 vector_length / internal_vector_length)
2952 , e_internal_vector_values((internal_vector_type *)btdm_.e_values.data(),
2953 btdm_.e_values.extent(0),
2954 btdm_.e_values.extent(1),
2955 btdm_.e_values.extent(2),
2956 btdm_.e_values.extent(3),
2957 vector_length / internal_vector_length)
2958 , scalar_values((btdm_scalar_type *)btdm_.values.data(),
2959 btdm_.values.extent(0),
2960 btdm_.values.extent(1),
2961 btdm_.values.extent(2),
2963 , scalar_values_schur((btdm_scalar_type *)btdm_.values_schur.data(),
2964 btdm_.values_schur.extent(0),
2965 btdm_.values_schur.extent(1),
2966 btdm_.values_schur.extent(2),
2968 , e_scalar_values((btdm_scalar_type *)btdm_.e_values.data(),
2969 btdm_.e_values.extent(0),
2970 btdm_.e_values.extent(1),
2971 btdm_.e_values.extent(2),
2972 btdm_.e_values.extent(3),
2974 , d_inv(btdm_.d_inv)
2975 , diag_offsets(btdm_.diag_offsets)
2976 , blocksize(btdm_.values.extent(1))
2977 , blocksize_square(blocksize * blocksize)
2981 , vector_loop_size(vector_length / internal_vector_length) {
2982 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
2983 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
2985 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
2986 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);
2988 hasBlockCrsMatrix = !A_bcrs.is_null();
2990 A_block_rowptr = G_->getLocalGraphDevice().row_map;
2991 if (hasBlockCrsMatrix) {
2992 A_values =
const_cast<block_crs_matrix_type *
>(A_bcrs.get())->getValuesDeviceNonConst();
2994 A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
2995 A_values = A_crs->getLocalValuesDevice(Tpetra::Access::ReadOnly);
3000 KOKKOS_INLINE_FUNCTION
3002 extract(local_ordinal_type partidx,
3003 local_ordinal_type local_subpartidx,
3004 local_ordinal_type npacks)
const {
3005#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3006 printf(
"extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
3008 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3009 const size_type kps = pack_td_ptr(partidx, local_subpartidx);
3010 local_ordinal_type kfs[vector_length] = {};
3011 local_ordinal_type ri0[vector_length] = {};
3012 local_ordinal_type nrows[vector_length] = {};
3014 for (local_ordinal_type vi = 0; vi < npacks; ++vi, ++partidx) {
3015 kfs[vi] = flat_td_ptr(partidx, local_subpartidx);
3016 ri0[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 0);
3017 nrows[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 1) - ri0[vi];
3018#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3019 printf(
"kfs[%d] = %d;\n", vi, kfs[vi]);
3020 printf(
"ri0[%d] = %d;\n", vi, ri0[vi]);
3021 printf(
"nrows[%d] = %d;\n", vi, nrows[vi]);
3024 local_ordinal_type tr_min = 0;
3025 local_ordinal_type tr_max = nrows[0];
3026 if (local_subpartidx % 2 == 1) {
3030#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3031 printf(
"tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3033 for (local_ordinal_type tr = tr_min, j = 0; tr < tr_max; ++tr) {
3034 for (local_ordinal_type e = 0; e < 3; ++e) {
3035 if (hasBlockCrsMatrix) {
3036 const impl_scalar_type *block[vector_length] = {};
3037 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3038 const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
3040 block[vi] = &A_values(Aj * blocksize_square);
3042 const size_type pi = kps + j;
3043#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3044 printf(
"Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
3047 for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3048 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3049 const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
3050 auto &v = internal_vector_values(pi, ii, jj, 0);
3051 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3052 v[vi] =
static_cast<btdm_scalar_type
>(block[vi][idx]);
3057 const size_type pi = kps + j;
3059 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
3060 const size_type Aj_c = A_colindsub(kfs[vi] + j);
3062 for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
3063 auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr) * blocksize + ii);
3065 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3066 scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c * blocksize + jj);
3072 if (nrows[0] == 1)
break;
3073 if (local_subpartidx % 2 == 0) {
3074 if (e == 1 && (tr == 0 || tr + 1 == nrows[0]))
break;
3075 for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3076 if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr + 1 == nrows[vi])) {
3082 if (e == 0 && (tr == -1 || tr == nrows[0]))
break;
3083 for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3084 if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
3094 KOKKOS_INLINE_FUNCTION
3096 extract(
const member_type &member,
3097 const local_ordinal_type &partidxbeg,
3098 local_ordinal_type local_subpartidx,
3099 const local_ordinal_type &npacks,
3100 const local_ordinal_type &vbeg)
const {
3101#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3102 printf(
"extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3104 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3105 local_ordinal_type kfs_vals[internal_vector_length] = {};
3106 local_ordinal_type ri0_vals[internal_vector_length] = {};
3107 local_ordinal_type nrows_vals[internal_vector_length] = {};
3109 const size_type kps = pack_td_ptr(partidxbeg, local_subpartidx);
3110 for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3111 kfs_vals[vi] = flat_td_ptr(partidxbeg + vi, local_subpartidx);
3112 ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 0);
3113 nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 1) - ri0_vals[vi];
3114#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3115 printf(
"kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3116 printf(
"ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3117 printf(
"nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3121 local_ordinal_type j_vals[internal_vector_length] = {};
3123 local_ordinal_type tr_min = 0;
3124 local_ordinal_type tr_max = nrows_vals[0];
3125 if (local_subpartidx % 2 == 1) {
3129#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3130 printf(
"tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3132 for (local_ordinal_type tr = tr_min; tr < tr_max; ++tr) {
3133 for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3134 const local_ordinal_type nrows = (local_subpartidx % 2 == 0 ? nrows_vals[vi] : nrows_vals[vi]);
3135 if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows + 1)) {
3136 auto &j = j_vals[vi];
3137 const local_ordinal_type kfs = kfs_vals[vi];
3138 const local_ordinal_type ri0 = ri0_vals[vi];
3139 local_ordinal_type lbeg, lend;
3140 if (local_subpartidx % 2 == 0) {
3141 lbeg = (tr == tr_min ? 1 : 0);
3142 lend = (tr == nrows - 1 ? 2 : 3);
3149 }
else if (tr == nrows) {
3154 if (hasBlockCrsMatrix) {
3155 for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3156 const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3157 const impl_scalar_type *block = &A_values(Aj * blocksize_square);
3158 const size_type pi = kps + j;
3159#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3160 printf(
"Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3162 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3163 [&](
const local_ordinal_type &ii) {
3164 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3165 scalar_values(pi, ii, jj, v) =
static_cast<btdm_scalar_type
>(block[tlb::getFlatIndex(ii, jj, blocksize)]);
3170 for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3171 const size_type Aj_c = A_colindsub(kfs + j);
3172 const size_type pi = kps + j;
3173 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3174 [&](
const local_ordinal_type &ii) {
3175 auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr) * blocksize + ii);
3176 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3177 scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c * blocksize + jj);
3187 template <
typename AAViewType,
3188 typename WWViewType>
3189 KOKKOS_INLINE_FUNCTION
void
3190 factorize_subline(
const member_type &member,
3191 const local_ordinal_type &i0,
3192 const local_ordinal_type &nrows,
3193 const local_ordinal_type &v,
3194 const AAViewType &AA,
3195 const WWViewType &WW)
const {
3196 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3198 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3199 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3202 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3204#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3205 printf(
"i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3209 auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3211 default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3216 local_ordinal_type i = i0;
3217 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
3218#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3219 printf(
"tr = %d, i = %d;\n", tr, i);
3221 B.assign_data(&AA(i + 1, 0, 0, v));
3222 KB::Trsm<member_type,
3223 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3224 default_mode_type, default_algo_type>::invoke(member, one, A, B);
3225 C.assign_data(&AA(i + 2, 0, 0, v));
3226 KB::Trsm<member_type,
3227 KB::Side::Right, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3228 default_mode_type, default_algo_type>::invoke(member, one, A, C);
3229 A.assign_data(&AA(i + 3, 0, 0, v));
3231 member.team_barrier();
3232 KB::Gemm<member_type,
3233 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3234 default_mode_type, default_algo_type>::invoke(member, -one, C, B, one, A);
3236 default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3240 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3241 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, A, W);
3242 KB::SetIdentity<member_type, default_mode_type>::invoke(member, A);
3243 member.team_barrier();
3244 KB::Trsm<member_type,
3245 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3246 default_mode_type, default_algo_type>::invoke(member, one, W, A);
3247 KB::Trsm<member_type,
3248 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3249 default_mode_type, default_algo_type>::invoke(member, one, W, A);
3254 struct ExtractAndFactorizeSubLineTag {};
3255 struct ExtractAndFactorizeFusedJacobiTag {};
3256 struct ExtractBCDTag {};
3257 struct ComputeETag {};
3258 struct ComputeSchurTag {};
3259 struct FactorizeSchurTag {};
3261 KOKKOS_INLINE_FUNCTION
3263 operator()(
const ExtractAndFactorizeSubLineTag &,
const member_type &member)
const {
3265 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3267 const local_ordinal_type subpartidx = packptr_sub(packidx);
3268 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3269 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3270 const local_ordinal_type partidx = subpartidx % n_parts;
3272 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3273 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3274 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3276 internal_vector_scratch_type_3d_view
3277 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3279#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3280 printf(
"rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3281 printf(
"vector_loop_size = %d\n", vector_loop_size);
3284 if (vector_loop_size == 1) {
3285 extract(partidx, local_subpartidx, npacks);
3286 factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3288 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3289 [&](
const local_ordinal_type &v) {
3290 const local_ordinal_type vbeg = v * internal_vector_length;
3291#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3292 printf(
"i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3295 extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3298 member.team_barrier();
3299 factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3304 KOKKOS_INLINE_FUNCTION
3306 operator()(
const ExtractAndFactorizeFusedJacobiTag &,
const member_type &member)
const {
3307 using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>;
3308 using default_mode_type =
typename default_mode_and_algo_type::mode_type;
3309 using default_algo_type =
typename default_mode_and_algo_type::algo_type;
3312 btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3313 btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3314 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3315 const local_ordinal_type nrows = lclrow.extent(0);
3316 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, half_vector_length),
3317 [&](
const local_ordinal_type &v) {
3318 local_ordinal_type row = member.league_rank() * half_vector_length + v;
3320 auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL());
3321 auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL());
3324 const impl_scalar_type *A_diag = A_values.data() + diag_offsets(row);
3327 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3329 W1.data()[i] = A_diag[i];
3332 KB::SetIdentity<member_type, default_mode_type>::invoke(member, W2);
3337 KB::SetIdentity<member_type, default_mode_type>::invoke(member, W1);
3339 member.team_barrier();
3341 KB::LU<member_type, default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, W1, tiny);
3342 member.team_barrier();
3343 KB::Trsm<member_type,
3344 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3345 default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3346 KB::Trsm<member_type,
3347 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3348 default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3349 member.team_barrier();
3351 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3353 auto d_inv_block = &d_inv(row, 0, 0);
3354 d_inv_block[i] = W2.data()[i];
3360 KOKKOS_INLINE_FUNCTION
3362 operator()(
const ExtractBCDTag &,
const member_type &member)
const {
3364 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3365 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3366 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3368 const local_ordinal_type subpartidx = packptr_sub(packidx);
3369 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3370 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3371 const local_ordinal_type partidx = subpartidx % n_parts;
3373 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3377 if (vector_loop_size == 1) {
3378 extract(partidx, local_subpartidx, npacks);
3380 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3381 [&](
const local_ordinal_type &v) {
3382 const local_ordinal_type vbeg = v * internal_vector_length;
3383#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3384 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3385 printf(
"i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3388 extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3392 member.team_barrier();
3394 const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3395 const size_type kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 1;
3397 const local_ordinal_type r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3398 const local_ordinal_type r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3400#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3401 printf(
"Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3405 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3406 Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3408 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3409 Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3412 KOKKOS_INLINE_FUNCTION
3414 operator()(
const ComputeETag &,
const member_type &member)
const {
3416 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3418 const local_ordinal_type subpartidx = packptr_sub(packidx);
3419 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3420 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3421 const local_ordinal_type partidx = subpartidx % n_parts;
3423 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3424 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3425 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
3426 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3427 const local_ordinal_type num_vectors = blocksize;
3431 internal_vector_scratch_type_3d_view
3432 WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3433 if (local_subpartidx == 0) {
3434 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
3435 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW,
true);
3437 }
else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
3438 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
3439 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3442 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
3443 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW,
true);
3444 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3449 KOKKOS_INLINE_FUNCTION
3451 operator()(
const ComputeSchurTag &,
const member_type &member)
const {
3453 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3454 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3455 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3457 const local_ordinal_type subpartidx = packptr_sub(packidx);
3458 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3459 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3460 const local_ordinal_type partidx = subpartidx % n_parts;
3463 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3469 const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
3470 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
3471 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0 + 2 : i0 + 2;
3473 for (local_ordinal_type i = 0; i < 4; ++i) {
3474 copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3475 Kokkos::subview(internal_vector_values, i0_offset + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3478 member.team_barrier();
3480 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
3482 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx) + 1;
3483 const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 2;
3485 const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3486 const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3488 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3490 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3491 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3493 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
3494 for (size_type i = 0; i < pack_td_ptr_schur(partidx, local_subpartidx_schur + 1) - pack_td_ptr_schur(partidx, local_subpartidx_schur); ++i) {
3495 local_ordinal_type e_r, e_c, c_kps;
3497 if (local_subpartidx_schur == 0) {
3502 }
else if (i == 3) {
3506 }
else if (i == 4) {
3518 }
else if (i == 1) {
3522 }
else if (i == 4) {
3526 }
else if (i == 5) {
3535 auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx, local_subpartidx_schur) + i, Kokkos::ALL(), Kokkos::ALL(), v);
3536 auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3537 auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3538 KB::Gemm<member_type,
3539 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3540 default_mode_type, default_algo_type>::invoke(member, -one, C, E, one, S);
3545 KOKKOS_INLINE_FUNCTION
3547 operator()(
const FactorizeSchurTag &,
const member_type &member)
const {
3548 const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3550 const local_ordinal_type subpartidx = packptr_sub(packidx);
3552 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3553 const local_ordinal_type partidx = subpartidx % n_parts;
3555 const local_ordinal_type i0 = pack_td_ptr_schur(partidx, 0);
3556 const local_ordinal_type nrows = 2 * (pack_td_ptr_schur.extent(1) - 1);
3558 internal_vector_scratch_type_3d_view
3559 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3561#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3562 printf(
"FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3565 if (vector_loop_size == 1) {
3566 factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3568 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3569 [&](
const local_ordinal_type &v) {
3570 factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3576 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3577 const local_ordinal_type team_size =
3578 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3579 recommended_team_size(blocksize, vector_length, internal_vector_length);
3580 const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3581 shmem_size(blocksize, blocksize, vector_loop_size);
3584#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3585 printf(
"Start ExtractAndFactorizeSubLineTag\n");
3587 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3588 Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeSubLineTag>
3589 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3591 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3592 writeBTDValuesToFile(n_parts, scalar_values,
"before.mm");
3594 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3595 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3597 execution_space().fence();
3599 writeBTDValuesToFile(n_parts, scalar_values,
"after.mm");
3600#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3601 printf(
"End ExtractAndFactorizeSubLineTag\n");
3605 if (packindices_schur.extent(1) > 0) {
3607#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3608 printf(
"Start ExtractBCDTag\n");
3610 Kokkos::deep_copy(e_scalar_values, KokkosKernels::ArithTraits<btdm_magnitude_type>::zero());
3611 Kokkos::deep_copy(scalar_values_schur, KokkosKernels::ArithTraits<btdm_magnitude_type>::zero());
3613 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_before_extract.mm");
3616 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3617 Kokkos::TeamPolicy<execution_space, ExtractBCDTag>
3618 policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3620 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3621 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3623 execution_space().fence();
3626#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3627 printf(
"End ExtractBCDTag\n");
3629 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values,
"after_extraction_of_BCD.mm");
3630#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3631 printf(
"Start ComputeETag\n");
3633 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_after_extract.mm");
3635 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3636 Kokkos::TeamPolicy<execution_space, ComputeETag>
3637 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3639 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3640 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3642 execution_space().fence();
3644 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_after_compute.mm");
3646#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3647 printf(
"End ComputeETag\n");
3652#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3653 printf(
"Start ComputeSchurTag\n");
3655 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3656 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"before_schur.mm");
3657 Kokkos::TeamPolicy<execution_space, ComputeSchurTag>
3658 policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3660 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3662 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"after_schur.mm");
3663 execution_space().fence();
3664#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3665 printf(
"End ComputeSchurTag\n");
3670#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3671 printf(
"Start FactorizeSchurTag\n");
3673 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3674 Kokkos::TeamPolicy<execution_space, FactorizeSchurTag>
3675 policy(packindices_schur.extent(0), team_size, vector_loop_size);
3676 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3677 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3679 execution_space().fence();
3680 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"after_factor_schur.mm");
3681#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3682 printf(
"End FactorizeSchurTag\n");
3687 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3690 void run_fused_jacobi() {
3691 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3692 const local_ordinal_type team_size =
3693 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3694 recommended_team_size(blocksize, half_vector_length, 1);
3695 const local_ordinal_type per_team_scratch =
3696 btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * half_vector_length);
3698 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag);
3699 Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeFusedJacobiTag>
3700 policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length);
3702 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3703 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeFusedJacobiTag>",
3706 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3713template <
typename MatrixType>
3714void performNumericPhase(
const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
3715 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3716 const BlockHelperDetails::PartInterface<MatrixType> &interf,
3717 BlockTridiags<MatrixType> &btdm,
3718 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny,
3719 bool use_fused_jacobi) {
3720 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3721 using execution_space =
typename impl_type::execution_space;
3722 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3723 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3724 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
3726 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase", NumericPhase);
3728 int blocksize = btdm.values.extent(1);
3731 int scratch_required;
3732 if (!use_fused_jacobi) {
3734 scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3737 scratch_required = btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * impl_type::half_vector_length);
3740 int max_scratch = team_policy_type::scratch_size_max(0);
3742 if (scratch_required < max_scratch) {
3744 ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
3745 if (!use_fused_jacobi)
3748 function.run_fused_jacobi();
3751 ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
3752 if (!use_fused_jacobi)
3755 function.run_fused_jacobi();
3757 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3763template <
typename MatrixType>
3767 using execution_space =
typename impl_type::execution_space;
3768 using memory_space =
typename impl_type::memory_space;
3770 using local_ordinal_type =
typename impl_type::local_ordinal_type;
3772 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
3773 using tpetra_multivector_type =
typename impl_type::tpetra_multivector_type;
3774 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
3775 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
3776 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
3777 using const_impl_scalar_type_2d_view_tpetra =
typename impl_scalar_type_2d_view_tpetra::const_type;
3778 static constexpr int vector_length = impl_type::vector_length;
3780 using member_type =
typename Kokkos::TeamPolicy<execution_space>::member_type;
3784 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3785 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3786 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3787 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3788 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3789 const local_ordinal_type blocksize;
3790 const local_ordinal_type num_vectors;
3793 vector_type_3d_view packed_multivector;
3794 const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3796 template <
typename TagType>
3797 KOKKOS_INLINE_FUNCTION
void copy_multivectors(
const local_ordinal_type &j,
3798 const local_ordinal_type &vi,
3799 const local_ordinal_type &pri,
3800 const local_ordinal_type &ri0)
const {
3801 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3802 for (local_ordinal_type i = 0; i < blocksize; ++i)
3803 packed_multivector(pri, i, col)[vi] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3808 const vector_type_3d_view &pmv)
3809 : partptr(interf.partptr)
3810 , packptr(interf.packptr)
3811 , part2packrowidx0(interf.part2packrowidx0)
3812 , part2rowidx0(interf.part2rowidx0)
3813 , lclrow(interf.lclrow)
3814 , blocksize(pmv.extent(1))
3815 , num_vectors(pmv.extent(2))
3816 , packed_multivector(pmv) {}
3819 KOKKOS_INLINE_FUNCTION
3821 operator()(
const local_ordinal_type &packidx)
const {
3822 local_ordinal_type partidx = packptr(packidx);
3823 local_ordinal_type npacks = packptr(packidx + 1) - partidx;
3824 const local_ordinal_type pri0 = part2packrowidx0(partidx);
3826 local_ordinal_type ri0[vector_length] = {};
3827 local_ordinal_type nrows[vector_length] = {};
3828 for (local_ordinal_type v = 0; v < npacks; ++v, ++partidx) {
3829 ri0[v] = part2rowidx0(partidx);
3830 nrows[v] = part2rowidx0(partidx + 1) - ri0[v];
3832 for (local_ordinal_type j = 0; j < nrows[0]; ++j) {
3833 local_ordinal_type cnt = 1;
3834 for (; cnt < npacks && j != nrows[cnt]; ++cnt)
3837 const local_ordinal_type pri = pri0 + j;
3838 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3839 for (local_ordinal_type i = 0; i < blocksize; ++i)
3840 for (local_ordinal_type v = 0; v < npacks; ++v)
3841 packed_multivector(pri, i, col)[v] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize * lclrow(ri0[v] + j) + i, col));
3845 KOKKOS_INLINE_FUNCTION
3847 operator()(
const member_type &member)
const {
3848 const local_ordinal_type packidx = member.league_rank();
3849 const local_ordinal_type partidx_begin = packptr(packidx);
3850 const local_ordinal_type npacks = packptr(packidx + 1) - partidx_begin;
3851 const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3852 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](
const local_ordinal_type &v) {
3853 const local_ordinal_type partidx = partidx_begin + v;
3854 const local_ordinal_type ri0 = part2rowidx0(partidx);
3855 const local_ordinal_type nrows = part2rowidx0(partidx + 1) - ri0;
3858 const local_ordinal_type pri = pri0;
3859 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
3860 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](
const local_ordinal_type &i) {
3861 packed_multivector(pri, i, col)[v] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize * lclrow(ri0) + i, col));
3865 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](
const local_ordinal_type &j) {
3866 const local_ordinal_type pri = pri0 + j;
3867 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3868 for (local_ordinal_type i = 0; i < blocksize; ++i)
3869 packed_multivector(pri, i, col)[v] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3875 void run(
const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3876 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3877 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3879 scalar_multivector = scalar_multivector_;
3880 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3881 const local_ordinal_type vl = vector_length;
3882 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3883 Kokkos::parallel_for(
"MultiVectorConverter::TeamPolicy", policy, *
this);
3885 const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3886 Kokkos::parallel_for(
"MultiVectorConverter::RangePolicy", policy, *
this);
3888 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3889 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3898struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3899 typedef KB::Mode::Serial mode_type;
3900 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3901#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3902 typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3904 typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3906 static int recommended_team_size(
const int ,
3913#if defined(KOKKOS_ENABLE_CUDA)
3914static inline int SolveTridiagsRecommendedCudaTeamSize(
const int blksize,
3915 const int vector_length,
3916 const int internal_vector_length) {
3917 const int vector_size = vector_length / internal_vector_length;
3918 int total_team_size(0);
3920 total_team_size = 32;
3921 else if (blksize <= 9)
3922 total_team_size = 32;
3923 else if (blksize <= 12)
3924 total_team_size = 96;
3925 else if (blksize <= 16)
3926 total_team_size = 128;
3927 else if (blksize <= 20)
3928 total_team_size = 160;
3930 total_team_size = 160;
3931 return total_team_size / vector_size;
3935struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3936 typedef KB::Mode::Team mode_type;
3937 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3938 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3939 static int recommended_team_size(
const int blksize,
3940 const int vector_length,
3941 const int internal_vector_length) {
3942 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3946struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3947 typedef KB::Mode::Team mode_type;
3948 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3949 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3950 static int recommended_team_size(
const int blksize,
3951 const int vector_length,
3952 const int internal_vector_length) {
3953 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3958#if defined(KOKKOS_ENABLE_HIP)
3959static inline int SolveTridiagsRecommendedHIPTeamSize(
const int blksize,
3960 const int vector_length,
3961 const int internal_vector_length) {
3962 const int vector_size = vector_length / internal_vector_length;
3963 int total_team_size(0);
3965 total_team_size = 32;
3966 else if (blksize <= 9)
3967 total_team_size = 32;
3968 else if (blksize <= 12)
3969 total_team_size = 96;
3970 else if (blksize <= 16)
3971 total_team_size = 128;
3972 else if (blksize <= 20)
3973 total_team_size = 160;
3975 total_team_size = 160;
3976 return total_team_size / vector_size;
3980struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3981 typedef KB::Mode::Team mode_type;
3982 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3983 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3984 static int recommended_team_size(
const int blksize,
3985 const int vector_length,
3986 const int internal_vector_length) {
3987 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3991struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3992 typedef KB::Mode::Team mode_type;
3993 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3994 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3995 static int recommended_team_size(
const int blksize,
3996 const int vector_length,
3997 const int internal_vector_length) {
3998 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
4003#if defined(KOKKOS_ENABLE_SYCL)
4004static inline int SolveTridiagsRecommendedSYCLTeamSize(
const int blksize,
4005 const int vector_length,
4006 const int internal_vector_length) {
4007 const int vector_size = vector_length / internal_vector_length;
4008 int total_team_size(0);
4010 total_team_size = 32;
4011 else if (blksize <= 9)
4012 total_team_size = 32;
4013 else if (blksize <= 12)
4014 total_team_size = 96;
4015 else if (blksize <= 16)
4016 total_team_size = 128;
4017 else if (blksize <= 20)
4018 total_team_size = 160;
4020 total_team_size = 160;
4021 return total_team_size / vector_size;
4025struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
4026 typedef KB::Mode::Team mode_type;
4027 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4028 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4029 static int recommended_team_size(
const int blksize,
4030 const int vector_length,
4031 const int internal_vector_length) {
4032 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4036struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
4037 typedef KB::Mode::Team mode_type;
4038 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
4039 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
4040 static int recommended_team_size(
const int blksize,
4041 const int vector_length,
4042 const int internal_vector_length) {
4043 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
4048template <
typename MatrixType>
4049struct SolveTridiags {
4051 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4052 using execution_space =
typename impl_type::execution_space;
4054 using local_ordinal_type =
typename impl_type::local_ordinal_type;
4057 using magnitude_type =
typename impl_type::magnitude_type;
4058 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
4059 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
4061 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
4062 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
4063 using size_type_2d_view =
typename impl_type::size_type_2d_view;
4065 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
4066 using internal_vector_type_3d_view =
typename impl_type::internal_vector_type_3d_view;
4067 using internal_vector_type_4d_view =
typename impl_type::internal_vector_type_4d_view;
4068 using internal_vector_type_5d_view =
typename impl_type::internal_vector_type_5d_view;
4069 using btdm_scalar_type_4d_view =
typename impl_type::btdm_scalar_type_4d_view;
4071 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
4073 using internal_vector_type =
typename impl_type::internal_vector_type;
4074 static constexpr int vector_length = impl_type::vector_length;
4075 static constexpr int internal_vector_length = impl_type::internal_vector_length;
4078 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
4079 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
4082 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
4083 using member_type =
typename team_policy_type::member_type;
4087 local_ordinal_type n_subparts_per_part;
4088 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
4089 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
4090 const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
4091 const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
4092 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
4093 const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
4094 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
4095 const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
4097 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
4098 const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
4101 const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
4104 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
4105 const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
4106 const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
4108 const Unmanaged<internal_vector_type_3d_view> X_internal_vector_values_schur;
4110 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
4111 const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
4113 const local_ordinal_type vector_loop_size;
4116 Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
4117#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
4118 AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4120 Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4122 const impl_scalar_type df;
4123 const bool compute_diff;
4127 local_ordinal_type active_schur_solve_vec;
4130 SolveTridiags(
const BlockHelperDetails::PartInterface<MatrixType> &interf,
4131 const BlockTridiags<MatrixType> &btdm,
4132 const vector_type_3d_view &pmv,
4133 const impl_scalar_type damping_factor,
4134 const bool is_norm_manager_active)
4136 n_subparts_per_part(interf.n_subparts_per_part)
4137 , partptr(interf.partptr)
4138 , packptr(interf.packptr)
4139 , packindices_sub(interf.packindices_sub)
4140 , packindices_schur(interf.packindices_schur)
4141 , part2packrowidx0(interf.part2packrowidx0)
4142 , part2packrowidx0_sub(interf.part2packrowidx0_sub)
4143 , lclrow(interf.lclrow)
4144 , packptr_sub(interf.packptr_sub)
4145 , partptr_sub(interf.partptr_sub)
4146 , pack_td_ptr_schur(btdm.pack_td_ptr_schur)
4149 pack_td_ptr(btdm.pack_td_ptr)
4150 , D_internal_vector_values((internal_vector_type *)btdm.values.data(),
4151 btdm.values.extent(0),
4152 btdm.values.extent(1),
4153 btdm.values.extent(2),
4154 vector_length / internal_vector_length)
4155 , X_internal_vector_values((internal_vector_type *)pmv.data(),
4159 vector_length / internal_vector_length)
4160 , X_internal_scalar_values((btdm_scalar_type *)pmv.data(),
4165 , X_internal_vector_values_schur(btdm.X_internal_vector_values_schur)
4166 , D_internal_vector_values_schur((internal_vector_type *)btdm.values_schur.data(),
4167 btdm.values_schur.extent(0),
4168 btdm.values_schur.extent(1),
4169 btdm.values_schur.extent(2),
4170 vector_length / internal_vector_length)
4171 , e_internal_vector_values((internal_vector_type *)btdm.e_values.data(),
4172 btdm.e_values.extent(0),
4173 btdm.e_values.extent(1),
4174 btdm.e_values.extent(2),
4175 btdm.e_values.extent(3),
4176 vector_length / internal_vector_length)
4177 , vector_loop_size(vector_length / internal_vector_length)
4178 , Y_scalar_multivector()
4180 , df(damping_factor)
4181 , compute_diff(is_norm_manager_active)
4182 , active_schur_solve_vec(0) {}
4186 KOKKOS_INLINE_FUNCTION
4188 copyToFlatMultiVector(
const member_type &member,
4189 const local_ordinal_type partidxbeg,
4190 const local_ordinal_type npacks,
4191 const local_ordinal_type pri0,
4192 const local_ordinal_type v,
4193 const local_ordinal_type blocksize,
4194 const local_ordinal_type num_vectors)
const {
4195 const local_ordinal_type vbeg = v * internal_vector_length;
4196 if (vbeg < npacks) {
4197 local_ordinal_type ri0_vals[internal_vector_length] = {};
4198 local_ordinal_type nrows_vals[internal_vector_length] = {};
4199 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4200 const local_ordinal_type partidx = partidxbeg + vv;
4201 ri0_vals[vi] = partptr(partidx);
4202 nrows_vals[vi] = partptr(partidx + 1) - ri0_vals[vi];
4205 impl_scalar_type z_partial_sum(0);
4206 if (nrows_vals[0] == 1) {
4207 const local_ordinal_type j = 0, pri = pri0;
4209 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4210 const local_ordinal_type ri0 = ri0_vals[vi];
4211 const local_ordinal_type nrows = nrows_vals[vi];
4213 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
4214 [&](
const local_ordinal_type &i) {
4215 const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4216 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4217 impl_scalar_type &y = Y_scalar_multivector(row, col);
4218 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4222 const auto yd_abs = KokkosKernels::ArithTraits<impl_scalar_type>::abs(yd);
4223 z_partial_sum += yd_abs * yd_abs;
4231 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows_vals[0]),
4232 [&](
const local_ordinal_type &j) {
4233 const local_ordinal_type pri = pri0 + j;
4234 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4235 const local_ordinal_type ri0 = ri0_vals[vi];
4236 const local_ordinal_type nrows = nrows_vals[vi];
4238 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4239 for (local_ordinal_type i = 0; i < blocksize; ++i) {
4240 const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4241 impl_scalar_type &y = Y_scalar_multivector(row, col);
4242 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4246 const auto yd_abs = KokkosKernels::ArithTraits<impl_scalar_type>::abs(yd);
4247 z_partial_sum += yd_abs * yd_abs;
4256 Z_scalar_vector(member.league_rank()) += z_partial_sum;
4263 template <
typename WWViewType>
4264 KOKKOS_INLINE_FUNCTION
void
4265 solveSingleVector(
const member_type &member,
4266 const local_ordinal_type &blocksize,
4267 const local_ordinal_type &i0,
4268 const local_ordinal_type &r0,
4269 const local_ordinal_type &nrows,
4270 const local_ordinal_type &v,
4271 const WWViewType &WW)
const {
4272 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4274 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4275 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4278 auto A = D_internal_vector_values.data();
4279 auto X = X_internal_vector_values.data();
4282 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4283 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
4287 const local_ordinal_type astep = D_internal_vector_values.stride(0);
4288 const local_ordinal_type as0 = D_internal_vector_values.stride(1);
4289 const local_ordinal_type as1 = D_internal_vector_values.stride(2);
4290 const local_ordinal_type xstep = X_internal_vector_values.stride(0);
4291 const local_ordinal_type xs0 = X_internal_vector_values.stride(1);
4294 A += i0 * astep + v;
4295 X += r0 * xstep + v;
4300 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4303 blocksize, blocksize,
4308 for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
4309 member.team_barrier();
4310 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4312 blocksize, blocksize,
4314 A + 2 * astep, as0, as1,
4317 X + 1 * xstep, xs0);
4318 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4321 blocksize, blocksize,
4323 A + 3 * astep, as0, as1,
4324 X + 1 * xstep, xs0);
4331 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4334 blocksize, blocksize,
4339 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4341 member.team_barrier();
4342 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4344 blocksize, blocksize,
4346 A + 1 * astep, as0, as1,
4349 X - 1 * xstep, xs0);
4350 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4353 blocksize, blocksize,
4356 X - 1 * xstep, xs0);
4362 const local_ordinal_type ws0 = WW.stride(0);
4363 auto W = WW.data() + v;
4364 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
4365 member, blocksize, X, xs0, W, ws0);
4366 member.team_barrier();
4367 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4369 blocksize, blocksize,
4378 template <
typename WWViewType>
4379 KOKKOS_INLINE_FUNCTION
void
4380 solveMultiVector(
const member_type &member,
4381 const local_ordinal_type & ,
4382 const local_ordinal_type &i0,
4383 const local_ordinal_type &r0,
4384 const local_ordinal_type &nrows,
4385 const local_ordinal_type &v,
4386 const WWViewType &WW)
const {
4387 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4389 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4390 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4393 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4394 const auto zero = KokkosKernels::ArithTraits<btdm_magnitude_type>::zero();
4397 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4398 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4401 local_ordinal_type i = i0, r = r0;
4405 KB::Trsm<member_type,
4406 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4407 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4408 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
4409 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
4410 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
4411 member.team_barrier();
4412 KB::Gemm<member_type,
4413 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4414 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4415 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
4416 KB::Trsm<member_type,
4417 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4418 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4419 X1.assign_data(X2.data());
4423 KB::Trsm<member_type,
4424 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4425 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4426 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4428 A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
4429 X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
4430 member.team_barrier();
4431 KB::Gemm<member_type,
4432 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4433 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4435 A.assign_data(&D_internal_vector_values(i, 0, 0, v));
4436 KB::Trsm<member_type,
4437 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4438 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4439 X1.assign_data(X2.data());
4443 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4444 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
4445 member.team_barrier();
4446 KB::Gemm<member_type,
4447 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4448 default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
4453 struct SingleVectorTag {};
4455 struct MultiVectorTag {};
4458 struct SingleVectorSubLineTag {};
4460 struct SingleVectorApplyCTag {};
4462 struct SingleVectorSchurTag {};
4464 struct SingleVectorApplyETag {};
4466 struct CopyVectorToFlatTag {};
4468 struct SingleZeroingTag {};
4471 KOKKOS_INLINE_FUNCTION
void
4472 operator()(
const SingleVectorTag<B> &,
const member_type &member)
const {
4473 const local_ordinal_type packidx = member.league_rank();
4474 const local_ordinal_type partidx = packptr(packidx);
4475 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4476 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4477 const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4478 const local_ordinal_type r0 = part2packrowidx0(partidx);
4479 const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4480 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4481 const local_ordinal_type num_vectors = 1;
4482 internal_vector_scratch_type_3d_view
4483 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4484 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4485 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4487 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4488 solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4489 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4494 KOKKOS_INLINE_FUNCTION
void
4495 operator()(
const MultiVectorTag<B> &,
const member_type &member)
const {
4496 const local_ordinal_type packidx = member.league_rank();
4497 const local_ordinal_type partidx = packptr(packidx);
4498 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4499 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4500 const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4501 const local_ordinal_type r0 = part2packrowidx0(partidx);
4502 const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4503 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4504 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4506 internal_vector_scratch_type_3d_view
4507 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4508 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4509 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4511 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4512 solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4513 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4518 KOKKOS_INLINE_FUNCTION
void
4519 operator()(
const SingleVectorSubLineTag<B> &,
const member_type &member)
const {
4521 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4523 const local_ordinal_type subpartidx = packptr_sub(packidx);
4524 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4525 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4526 const local_ordinal_type partidx = subpartidx % n_parts;
4528 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
4529 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4530 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4531 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4532 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4538 internal_vector_scratch_type_3d_view
4539 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4541 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4542 auto X_internal_vec = Kokkos::subview(X_internal_vector_values, Kokkos::ALL(), Kokkos::ALL(), active_schur_solve_vec, Kokkos::ALL());
4543 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vec, WW);
4548 KOKKOS_INLINE_FUNCTION
void
4549 operator()(
const SingleVectorApplyCTag<B> &,
const member_type &member)
const {
4552 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4554 const local_ordinal_type subpartidx = packptr_sub(packidx);
4555 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4556 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4557 const local_ordinal_type partidx = subpartidx % n_parts;
4558 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4561 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4562 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4563 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4567 const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
4568 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
4569 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0 + 2 : i0 + 2;
4574 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4576 const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx) - 2 : 0;
4577 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx + 1) + 1;
4579 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4581 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4582 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4584 if (local_subpartidx == 0) {
4585 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4586 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4587 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4588 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4590 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4592 blocksize, blocksize,
4594 C.data(), C.stride(0), C.stride(1),
4595 v_1.data(), v_1.stride(0),
4597 v_2.data(), v_2.stride(0));
4599 }
else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4600 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4601 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), active_schur_solve_vec, v);
4602 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4603 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4605 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4607 blocksize, blocksize,
4609 C.data(), C.stride(0), C.stride(1),
4610 v_1.data(), v_1.stride(0),
4612 v_2.data(), v_2.stride(0));
4615 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4617 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4618 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4619 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4621 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4623 blocksize, blocksize,
4625 C.data(), C.stride(0), C.stride(1),
4626 v_1.data(), v_1.stride(0),
4628 v_2.data(), v_2.stride(0));
4631 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), active_schur_solve_vec, v);
4632 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4633 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4635 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4637 blocksize, blocksize,
4639 C.data(), C.stride(0), C.stride(1),
4640 v_1.data(), v_1.stride(0),
4642 v_2.data(), v_2.stride(0));
4649 KOKKOS_INLINE_FUNCTION
void
4650 operator()(
const SingleVectorSchurTag<B> &,
const member_type &member)
const {
4651 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4653 const local_ordinal_type partidx = packptr_sub(packidx);
4655 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4657 const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx, 0);
4658 const local_ordinal_type nrows = 2 * (n_subparts_per_part - 1);
4660 const local_ordinal_type r0_schur = nrows * member.league_rank();
4662 internal_vector_scratch_type_3d_view
4663 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4665 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4666 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4667 for (local_ordinal_type i = 0; i < 2; ++i) {
4668 copy3DView<local_ordinal_type>(member,
4669 Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL()),
4670 Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), active_schur_solve_vec, Kokkos::ALL()));
4674 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4675 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4678 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4679 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4680 for (local_ordinal_type i = 0; i < 2; ++i) {
4681 copy3DView<local_ordinal_type>(member,
4682 Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), active_schur_solve_vec, Kokkos::ALL()),
4683 Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL()));
4689 KOKKOS_INLINE_FUNCTION
void
4690 operator()(
const SingleVectorApplyETag<B> &,
const member_type &member)
const {
4691 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4693 const local_ordinal_type subpartidx = packptr_sub(packidx);
4694 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4695 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4696 const local_ordinal_type partidx = subpartidx % n_parts;
4697 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4699 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4700 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4704 const auto one = KokkosKernels::ArithTraits<btdm_magnitude_type>::one();
4706 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4708 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4709 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4711 if (local_subpartidx == 0) {
4712 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4713 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4715 for (local_ordinal_type row = 0; row < nrows; ++row) {
4716 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4717 auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4719 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4721 blocksize, blocksize,
4723 E.data(), E.stride(0), E.stride(1),
4724 v_2.data(), v_2.stride(0),
4726 v_1.data(), v_1.stride(0));
4729 }
else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4730 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4731 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4733 for (local_ordinal_type row = 0; row < nrows; ++row) {
4734 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4735 auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4737 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4739 blocksize, blocksize,
4741 E.data(), E.stride(0), E.stride(1),
4742 v_2.data(), v_2.stride(0),
4744 v_1.data(), v_1.stride(0));
4748 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4750 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), active_schur_solve_vec, v);
4752 for (local_ordinal_type row = 0; row < nrows; ++row) {
4753 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4754 auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4756 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4758 blocksize, blocksize,
4760 E.data(), E.stride(0), E.stride(1),
4761 v_2.data(), v_2.stride(0),
4763 v_1.data(), v_1.stride(0));
4767 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), active_schur_solve_vec, v);
4769 for (local_ordinal_type row = 0; row < nrows; ++row) {
4770 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), active_schur_solve_vec, v);
4771 auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4773 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4775 blocksize, blocksize,
4777 E.data(), E.stride(0), E.stride(1),
4778 v_2.data(), v_2.stride(0),
4780 v_1.data(), v_1.stride(0));
4788 KOKKOS_INLINE_FUNCTION
void
4789 operator()(
const CopyVectorToFlatTag<B> &,
const member_type &member)
const {
4790 const local_ordinal_type packidx = member.league_rank();
4791 const local_ordinal_type partidx = packptr(packidx);
4792 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4793 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4794 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4795 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4797 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4798 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4803 KOKKOS_INLINE_FUNCTION
void
4804 operator()(
const SingleZeroingTag<B> &,
const member_type &member)
const {
4805 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4806 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4810 void run(
const impl_scalar_type_2d_view_tpetra &Y,
4811 const impl_scalar_type_1d_view &Z) {
4812 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4813 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::SolveTridiags", SolveTridiags);
4816 this->Y_scalar_multivector = Y;
4817 this->Z_scalar_vector = Z;
4819 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4820 const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4822 const local_ordinal_type team_size =
4823 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4824 recommended_team_size(blocksize, vector_length, internal_vector_length);
4825 const int per_team_scratch = internal_vector_scratch_type_3d_view ::shmem_size(blocksize, num_vectors, vector_loop_size);
4827#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4828 if (packindices_schur.extent(1) <= 0) { \
4829 if (num_vectors == 1) { \
4830 Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4831 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4832 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4833 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4836 Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4837 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4838 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4839 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4844 Kokkos::TeamPolicy<execution_space, SingleZeroingTag<B>> \
4845 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4846 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4849 for (local_ordinal_type vec = 0; vec < num_vectors; vec++) { \
4850 this->active_schur_solve_vec = vec; \
4852 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4853 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4854 Kokkos::TeamPolicy<execution_space, SingleVectorSubLineTag<B>> \
4855 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4856 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4857 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4859 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4860 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4863 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4864 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4865 Kokkos::TeamPolicy<execution_space, SingleVectorApplyCTag<B>> \
4866 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4867 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4869 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4870 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4873 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4874 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4875 Kokkos::TeamPolicy<execution_space, SingleVectorSchurTag<B>> \
4876 policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4877 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4878 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4880 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4881 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4884 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4885 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4886 Kokkos::TeamPolicy<execution_space, SingleVectorApplyETag<B>> \
4887 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4888 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4890 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4891 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4895 Kokkos::TeamPolicy<execution_space, CopyVectorToFlatTag<B>> \
4896 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4897 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<CopyVectorToFlatTag>", \
4902 switch (blocksize) {
4903 case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(3);
4904 case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(5);
4905 case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(6);
4906 case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(7);
4907 case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4908 case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4909 case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4910 case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4911 case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4912 case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4913 case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4914 case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4915 default: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(0);
4917#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
4919 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
4920 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
4927template <
typename MatrixType>
4928int applyInverseJacobi(
4929 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
4930 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
4931 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
4932 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
4933 const bool overlap_communication_and_computation,
4935 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
4936 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
4937 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z,
4938 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
4940 const BlockHelperDetails::PartInterface<MatrixType> &interf,
4941 const BlockTridiags<MatrixType> &btdm,
4942 const BlockHelperDetails::AmD<MatrixType> &amd,
4943 typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work,
4944 BlockHelperDetails::NormManager<MatrixType> &norm_manager,
4948 const int max_num_sweeps,
4949 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
4950 const int check_tol_every) {
4951 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
4953 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4954 using node_memory_space =
typename impl_type::node_memory_space;
4955 using local_ordinal_type =
typename impl_type::local_ordinal_type;
4956 using size_type =
typename impl_type::size_type;
4957 using impl_scalar_type =
typename impl_type::impl_scalar_type;
4958 using magnitude_type =
typename impl_type::magnitude_type;
4959 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
4960 using vector_type_1d_view =
typename impl_type::vector_type_1d_view;
4961 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
4962 using tpetra_multivector_type =
typename impl_type::tpetra_multivector_type;
4964 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
4967 TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
4968 "Neither Tpetra importer nor Async importer is null.");
4970 TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
4971 "Maximum number of sweeps must be >= 1.");
4974 const bool is_seq_method_requested = !tpetra_importer.is_null();
4975 const bool is_async_importer_active = !async_importer.is_null();
4976 const bool is_norm_manager_active = tol > KokkosKernels::ArithTraits<magnitude_type>::zero();
4977 const magnitude_type tolerance = tol * tol;
4978 const local_ordinal_type blocksize = btdm.values.extent(1);
4979 const local_ordinal_type num_vectors = Y.getNumVectors();
4980 const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4982 const impl_scalar_type zero(0.0);
4984 TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4985 "The seq method for applyInverseJacobi, "
4986 <<
"which in any case is for developer use only, "
4987 <<
"does not support norm-based termination.");
4988 const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4989 Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4990 TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
4991 std::invalid_argument,
4992 "The seq method for applyInverseJacobi, "
4993 <<
"which in any case is for developer use only, "
4994 <<
"only supports memory spaces accessible from host.");
4997 const size_type work_span_required = num_blockrows * num_vectors * blocksize;
4998 if (work.span() < work_span_required)
4999 work = vector_type_1d_view(
"vector workspace 1d view", work_span_required);
5002 const local_ordinal_type W_size = interf.packptr.extent(0) - 1;
5003 if (local_ordinal_type(W.extent(0)) < W_size)
5004 W = impl_scalar_type_1d_view(
"W", W_size);
5006 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5008 if (is_seq_method_requested) {
5009 if (Z.getNumVectors() != Y.getNumVectors())
5010 Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors,
false);
5012 if (is_async_importer_active) {
5014 async_importer->createDataBuffer(num_vectors);
5015 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5021 vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
5022 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5023 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5024 const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
5025 if (is_y_zero) Kokkos::deep_copy(YY, zero);
5027 MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
5028 SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
5029 damping_factor, is_norm_manager_active);
5031 const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
5033 auto A_crs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
5034 auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);
5036 bool hasBlockCrsMatrix = !A_bcrs.is_null();
5039 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());
5041 BlockHelperDetails::ComputeResidualVector<MatrixType>
5042 compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
5043 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
5047 if (is_norm_manager_active)
5048 norm_manager.setCheckFrequency(check_tol_every);
5052 for (; sweep < max_num_sweeps; ++sweep) {
5056 multivector_converter.run(XX);
5058 if (is_seq_method_requested) {
5062 Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5063 compute_residual_vector.run(YY, XX, ZZ);
5066 multivector_converter.run(YY);
5070 if (overlap_communication_and_computation || !is_async_importer_active) {
5071 if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5073 compute_residual_vector.run(pmv, XX, YY, remote_multivector,
true);
5074 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5075 if (is_async_importer_active) async_importer->cancel();
5078 if (is_async_importer_active) {
5079 async_importer->syncRecv();
5081 compute_residual_vector.run(pmv, XX, YY, remote_multivector,
false);
5084 if (is_async_importer_active)
5085 async_importer->syncExchange(YY);
5086 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance))
break;
5088 compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5096 solve_tridiags.run(YY, W);
5099 if (is_norm_manager_active) {
5101 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5102 if (sweep + 1 == max_num_sweeps) {
5103 norm_manager.ireduce(sweep,
true);
5104 norm_manager.checkDone(sweep + 1, tolerance,
true);
5106 norm_manager.ireduce(sweep);
5114 if (is_norm_manager_active) norm_manager.finalize();
5122template <
typename MatrixType>
5123int applyFusedBlockJacobi(
5124 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5125 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5126 const bool overlap_communication_and_computation,
5128 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
5129 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
5130 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
5132 const BlockHelperDetails::PartInterface<MatrixType> &interf,
5133 const BlockTridiags<MatrixType> &btdm,
5134 const BlockHelperDetails::AmD<MatrixType> &amd,
5135 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work,
5136 BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5140 const int max_num_sweeps,
5141 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5142 const int check_tol_every) {
5143 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5144 using local_ordinal_type =
typename impl_type::local_ordinal_type;
5145 using size_type =
typename impl_type::size_type;
5146 using magnitude_type =
typename impl_type::magnitude_type;
5147 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
5148 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
5150 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi);
5153 TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5154 "Neither Tpetra importer nor Async importer is null.");
5156 TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5157 "Maximum number of sweeps must be >= 1.");
5160 const bool is_async_importer_active = !async_importer.is_null();
5161 const bool is_norm_manager_active = tol > KokkosKernels::ArithTraits<magnitude_type>::zero();
5162 const magnitude_type tolerance = tol * tol;
5163 const local_ordinal_type blocksize = btdm.d_inv.extent(1);
5164 const local_ordinal_type num_vectors = Y.getNumVectors();
5165 const local_ordinal_type num_blockrows = interf.nparts;
5167 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5169 if (is_async_importer_active) {
5171 async_importer->createDataBuffer(num_vectors);
5172 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5176 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5177 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5179 const bool two_pass_residual =
5180 overlap_communication_and_computation && is_async_importer_active;
5184 TEUCHOS_TEST_FOR_EXCEPT_MSG(
5185 size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1),
5186 "Local LHS vector (YY) has total size " << YY.extent(0) <<
"x" << YY.extent(1) <<
" = " << YY.extent(0) * YY.extent(1) <<
",\n"
5187 <<
"but expected " << num_blockrows <<
"x" << blocksize <<
"x" << num_vectors <<
" = " <<
size_t(num_blockrows) * blocksize * num_vectors <<
'\n');
5188 size_type work_required = size_type(num_blockrows) * blocksize * num_vectors;
5189 if (work.extent(0) < work_required) {
5190 work = impl_scalar_type_1d_view(do_not_initialize_tag(
"flat workspace 1d view"), work_required);
5193 Unmanaged<impl_scalar_type_2d_view_tpetra> y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors);
5196 if (W.extent(0) !=
size_t(num_blockrows))
5197 W = impl_scalar_type_1d_view(do_not_initialize_tag(
"W"), num_blockrows);
5199 BlockHelperDetails::ComputeResidualAndSolve<MatrixType>
5200 residualAndSolve(amd, btdm.d_inv, W, blocksize, damping_factor);
5203 if (is_norm_manager_active)
5204 norm_manager.setCheckFrequency(check_tol_every);
5209 Unmanaged<impl_scalar_type_2d_view_tpetra> y_buffers[2] = {YY, y_doublebuf};
5214 for (; sweep < max_num_sweeps; ++sweep) {
5217 residualAndSolve.run_y_zero(XX, y_buffers[1 - current_y]);
5220 if (overlap_communication_and_computation || !is_async_importer_active) {
5221 if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]);
5222 if (two_pass_residual) {
5225 residualAndSolve.run_pass1_of_2(XX, y_buffers[current_y], y_buffers[1 - current_y]);
5229 residualAndSolve.run_single_pass(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5231 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5232 if (is_async_importer_active) async_importer->cancel();
5235 if (is_async_importer_active) {
5236 async_importer->syncRecv();
5238 residualAndSolve.run_pass2_of_2(y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5241 if (is_async_importer_active)
5242 async_importer->syncExchange(y_buffers[current_y]);
5243 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance))
break;
5245 residualAndSolve.run_single_pass(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5250 if (is_norm_manager_active) {
5251 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5252 if (sweep + 1 == max_num_sweeps) {
5253 norm_manager.ireduce(sweep,
true);
5254 norm_manager.checkDone(sweep + 1, tolerance,
true);
5256 norm_manager.ireduce(sweep);
5261 current_y = 1 - current_y;
5263 if (current_y == 1) {
5265 Kokkos::deep_copy(YY, y_doublebuf);
5269 if (is_norm_manager_active) norm_manager.finalize();
5274template <
typename MatrixType>
5277 using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5281 using async_import_type = AsyncableImport<MatrixType>;
5284 Teuchos::RCP<const typename impl_type::tpetra_row_matrix_type> A;
5285 Teuchos::RCP<const typename impl_type::tpetra_crs_graph_type> blockGraph;
5286 Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
5287 Teuchos::RCP<async_import_type> async_importer;
5288 bool overlap_communication_and_computation;
5291 mutable typename impl_type::tpetra_multivector_type Z;
5292 mutable typename impl_type::impl_scalar_type_1d_view W;
5295 part_interface_type part_interface;
5300 bool use_fused_jacobi;
5303 mutable typename impl_type::vector_type_1d_view work;
5305 mutable typename impl_type::impl_scalar_type_1d_view work_flat;
Preconditioners and smoothers for Tpetra sparse matrices.
Definition Ifpack2_AdditiveSchwarz_decl.hpp:40
Definition Ifpack2_BlockHelper.hpp:381
Definition Ifpack2_BlockHelper.hpp:270
size_t size_type
Definition Ifpack2_BlockHelper.hpp:274
KokkosKernels::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition Ifpack2_BlockHelper.hpp:284
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition Ifpack2_BlockHelper.hpp:347
Definition Ifpack2_BlockHelper.hpp:479
Definition Ifpack2_BlockTriDiContainer_impl.hpp:139
Definition Ifpack2_BlockTriDiContainer_impl.hpp:1604
Definition Ifpack2_BlockTriDiContainer_impl.hpp:2284
forward declaration
Definition Ifpack2_BlockTriDiContainer_impl.hpp:5275
Definition Ifpack2_BlockTriDiContainer_impl.hpp:3764