MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_Constraint_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// MueLu: A package for multigrid based preconditioning
4//
5// Copyright 2012 NTESS and the MueLu contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef MUELU_CONSTRAINT_DEF_HPP
11#define MUELU_CONSTRAINT_DEF_HPP
12
13#include <Xpetra_Map.hpp>
14#include <Xpetra_MultiVector.hpp>
15#include <Xpetra_Matrix.hpp>
16#include "KokkosBatched_Copy_Internal.hpp"
17#include "Teuchos_Assert.hpp"
18#include "Xpetra_MatrixFactory.hpp"
19#include "Xpetra_MatrixMatrix.hpp"
20
21#include "MueLu_Exceptions.hpp"
22#include "MueLu_ProductOperator.hpp"
24#include "MueLu_Utilities.hpp"
25#include "MueLu_Monitor.hpp"
26
27#include "KokkosBlas1_set.hpp"
28#include "KokkosBatched_QR_FormQ_TeamVector_Internal.hpp"
29#include "KokkosBatched_ApplyQ_Decl.hpp"
30#include "KokkosBatched_SetIdentity_Decl.hpp"
31#include "KokkosBatched_SetIdentity_Impl.hpp"
32#include "Kokkos_DualView.hpp"
33#include "Kokkos_Pair.hpp"
34#include "Kokkos_UnorderedMap.hpp"
35#include "KokkosBatched_QR_Decl.hpp"
36#include "KokkosBatched_QR_Serial_Impl.hpp"
37#include "KokkosBatched_QR_TeamVector_Impl.hpp"
38#include "KokkosBatched_QR_FormQ_TeamVector_Internal.hpp"
39#include "KokkosBatched_LU_Decl.hpp"
40#include "KokkosBatched_LU_Team_Impl.hpp"
41#include "KokkosBatched_Trsv_Decl.hpp"
42#include "KokkosBatched_Trsv_TeamVector_Impl.hpp"
43#include "KokkosBatched_Gemm_Decl.hpp"
44#include "KokkosBatched_Gemm_Team_Impl.hpp"
45#include "KokkosBatched_Gemv_Decl.hpp"
46#include "KokkosBatched_Gemv_Team_Impl.hpp"
47#include "KokkosBatched_Copy_Decl.hpp"
48#include "KokkosBatched_Copy_Impl.hpp"
49#include "KokkosBlas1_set.hpp"
50
51namespace MueLu {
52
53template <class LocalGraph, class LocalVector>
54class MinSpmMV {
55 private:
56 using local_ordinal_type = typename LocalVector::value_type;
57
58 LocalGraph lclGraph;
59 LocalVector lhs;
60 LocalVector rhs;
61
62 const local_ordinal_type MAX_VAL = KokkosKernels::ArithTraits<local_ordinal_type>::max();
63
64 public:
65 MinSpmMV(LocalGraph lclGraph_, LocalVector lhs_, LocalVector rhs_)
66 : lclGraph(lclGraph_)
67 , lhs(lhs_)
68 , rhs(rhs_) {}
69
70 KOKKOS_INLINE_FUNCTION
71 void init(bool& dst) {
72 dst = false;
73 }
74
75 KOKKOS_INLINE_FUNCTION
76 void join(bool& dst, const bool& src) {
77 dst = dst || src;
78 }
79
80 KOKKOS_INLINE_FUNCTION
81 void operator()(const local_ordinal_type i, bool& changed) const {
83 for (local_ordinal_type jj = lclGraph.row_map(i); jj < (local_ordinal_type)lclGraph.row_map(i + 1); ++jj) {
84 auto j = lclGraph.entries(jj);
85 val = Kokkos::min(val, lhs(j));
86 }
87 auto prev = rhs(i);
88 rhs(i) = val;
89 changed = changed || (prev != val);
90 }
91};
92
93template <class LocalGraph, class LocalVector>
94class MinSpmMVT {
95 private:
96 using local_ordinal_type = typename LocalVector::value_type;
97
98 LocalGraph lclGraph;
99 LocalVector lhs;
100 LocalVector rhs;
101
102 const local_ordinal_type MAX_VAL = KokkosKernels::ArithTraits<local_ordinal_type>::max();
103
104 public:
105 MinSpmMVT(LocalGraph lclGraph_, LocalVector lhs_, LocalVector rhs_)
106 : lclGraph(lclGraph_)
107 , lhs(lhs_)
108 , rhs(rhs_) {
109 Kokkos::deep_copy(rhs, MAX_VAL);
110 }
111
112 KOKKOS_INLINE_FUNCTION
113 void operator()(const local_ordinal_type i) const {
114 local_ordinal_type val = lhs(i);
115 for (local_ordinal_type jj = lclGraph.row_map(i); jj < lclGraph.row_map(i + 1); ++jj) {
116 auto j = lclGraph.entries(jj);
117 Kokkos::atomic_min(&rhs(j), val);
119 }
121
122template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
124 public:
125 using CrsGraph = typename Xpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node>;
126 using CrsMatrix = typename Xpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
127 using local_graph_type = typename CrsGraph::local_graph_type;
128 using local_matrix_type = typename CrsMatrix::local_matrix_type;
129 using scalar_type = typename local_matrix_type::value_type;
130 using ATS = KokkosKernels::ArithTraits<scalar_type>;
131 using magnitude_type = typename ATS::magnitudeType;
132 using magATS = KokkosKernels::ArithTraits<magnitude_type>;
133 using memory_space = typename Node::memory_space;
135 using shared_matrix = Kokkos::View<scalar_type**, typename Node::execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged>;
136 using shared_vector = Kokkos::View<scalar_type*, typename Node::execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged>;
138 BlockInverseFunctor(local_matrix_type A_, local_graph_type blocks_, LocalOrdinal maxBlocksize_, local_matrix_type invA_, Kokkos::View<bool*, memory_space> singular_)
139 : A(A_)
140 , blocks(blocks_)
141 , maxBlocksize(maxBlocksize_)
142 , invA(invA_)
143 , singular(singular_) {}
144
147 class TagApply {};
148
149 private:
150 const scalar_type zero = ATS::zero();
151 const magnitude_type mag_zero = magATS::zero();
152 const scalar_type one = ATS::one();
153
158 Kokkos::View<bool*, memory_space> singular;
160 public:
161 KOKKOS_INLINE_FUNCTION
162 void operator()(TagFindSingularBlocks, const typename Kokkos::TeamPolicy<typename Node::execution_space>::member_type& thread) const {
163 using member_type = typename Kokkos::TeamPolicy<typename Node::execution_space>::member_type;
164
165 auto blockId = thread.league_rank();
166 auto blockRow = blocks.rowConst(blockId);
167 auto blockSize = blockRow.length;
169 shared_matrix lclA(thread.team_shmem(), blockSize, blockSize);
170 shared_vector lclConst(thread.team_shmem(), blockSize);
171 shared_vector lclAConst(thread.team_shmem(), blockSize);
172
173 // Initialize lclA
174 KokkosBlas::TeamSet<member_type>::invoke(thread, zero, lclA);
175 KokkosBlas::TeamSet<member_type>::invoke(thread, one, lclConst);
176 thread.team_barrier();
177
178 // extract block from A
179 for (LocalOrdinal ii = 0; ii < blockSize; ++ii) {
180 auto i = blockRow.colidx(ii);
181 auto row = A.rowConst(i);
182 for (LocalOrdinal jj = 0; jj < row.length; ++jj) {
183 auto j = row.colidx(jj);
184 auto d = row.value(jj);
185 for (LocalOrdinal kk = 0; kk < blockSize; ++kk)
186 if (blockRow.colidx(kk) == j) {
187 lclA(ii, kk) += d;
188 break;
189 }
190 }
191 }
192
193 // lclAConst = lclA * lclConst
194 KokkosBlas::TeamGemv<member_type, KokkosBlas::Trans::NoTranspose, KokkosBlas::Algo::Gemv::Unblocked>::invoke(thread, one, lclA, lclConst, zero, lclAConst);
195 thread.team_barrier();
196
197 magnitude_type norm2 = mag_zero;
198 for (LocalOrdinal i = 0; i < blockSize; ++i) {
199 norm2 += ATS::magnitude(lclAConst(i) * lclAConst(i));
200 }
201
202 singular(blockId) = (ATS::magnitude(norm2) < ATS::epsilon());
203 }
204
205 KOKKOS_INLINE_FUNCTION
206 void operator()(TagCountSingularBlocks, const typename Kokkos::TeamPolicy<typename Node::execution_space>::member_type& thread, LocalOrdinal& numSingularBlocks) const {
207 auto blockId = thread.league_rank();
208 if (singular(blockId))
209 ++numSingularBlocks;
210 }
211
212 KOKKOS_INLINE_FUNCTION
213 void operator()(TagApply, const typename Kokkos::TeamPolicy<typename Node::execution_space>::member_type& thread) const {
214 using member_type = typename Kokkos::TeamPolicy<typename Node::execution_space>::member_type;
215
216 auto blockId = thread.league_rank();
217 auto blockRow = blocks.rowConst(blockId);
218 auto blockSize = blockRow.length;
219
220 shared_matrix lclA(thread.team_shmem(), blockSize, blockSize);
221 shared_matrix lclInvA(thread.team_shmem(), blockSize, blockSize);
222
223 const bool PseudoInverse = (!(singular.extent(0) == 0)) && singular(blockId);
224
225 // Initialize lclA
226 // If PseudoInverse, we shift the constant mode.
227 KokkosBlas::TeamSet<member_type>::invoke(thread, PseudoInverse ? one : zero, lclA);
228 thread.team_barrier();
229
230 // extract block from A
231 for (LocalOrdinal ii = 0; ii < blockSize; ++ii) {
232 auto i = blockRow.colidx(ii);
233 auto row = A.rowConst(i);
234 for (LocalOrdinal jj = 0; jj < row.length; ++jj) {
235 auto j = row.colidx(jj);
236 auto d = row.value(jj);
237 for (LocalOrdinal kk = 0; kk < blockSize; ++kk)
238 if (blockRow.colidx(kk) == j) {
239 lclA(ii, kk) += d;
240 break;
241 }
242 }
243 }
244
245 // LU
246 {
247 // LU factorization: lclA = L * U
248 KokkosBatched::TeamLU<member_type, KokkosBlas::Algo::QR::Unblocked>::invoke(thread, lclA);
249
250 // set lclInvA to identity matrix
251 KokkosBatched::TeamSetIdentity<member_type>::invoke(thread, lclInvA);
252 thread.team_barrier();
253
254 // // lclInvA = L^{-1}*lclInvA
255 for (LocalOrdinal j = 0; j < blockSize; ++j)
256 KokkosBatched::TeamVectorTrsv<member_type, KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::Unit, KokkosBatched::Algo::Trsv::Unblocked>::invoke(thread, one, lclA, Kokkos::subview(lclInvA, Kokkos::ALL(), j));
257 thread.team_barrier();
258
259 // // lclInvA = R^{-1}*lclInvA
260 for (LocalOrdinal j = 0; j < blockSize; ++j)
261 KokkosBatched::TeamVectorTrsv<member_type, KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsv::Unblocked>::invoke(thread, one, lclA, Kokkos::subview(lclInvA, Kokkos::ALL(), j));
262 thread.team_barrier();
263 }
264
265 // The QR in Kokkos Kernels is broken. Once it gets fixed we can use it and remove LU.
266 //
267 // QR
268 // {
269
270 // shared_vector tau(thread.team_shmem(), blockSize);
271 // shared_vector work(thread.team_shmem(), blockSize);
272
273 // // QR factorization: lclA = Q * R
274 // KokkosBatched::TeamVectorQR<member_type, KokkosBlas::Algo::QR::Unblocked>::invoke(thread, lclA, tau, work);
275
276 // // set lclInvA to identity matrix
277 // KokkosBatched::TeamSetIdentity<member_type>::invoke(thread, lclInvA);
278 // thread.team_barrier();
279
280 // // lclInvA = Q^T*lclInvA
281 // KokkosBatched::TeamVectorApplyQ<member_type, KokkosBatched::Side::Left, KokkosBlas::Trans::Transpose, KokkosBlas::Algo::ApplyQ::Unblocked>::invoke(thread, lclA, tau, lclInvA, work);
282 // thread.team_barrier();
283
284 // // lclInvA = R^{-1}*lclInvA
285 // for (LocalOrdinal j = 0; j < blockSize; ++j)
286 // KokkosBatched::TeamVectorTrsv<member_type, KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsv::Unblocked>::invoke(thread, one, lclA, Kokkos::subview(lclInvA, Kokkos::ALL(), j));
287 // thread.team_barrier();
288 // }
289
290 if (PseudoInverse) {
291 // Multiply with projection that removes constant vector
292
293 // Set up projection matrix
294 for (LocalOrdinal ii = 0; ii < blockSize; ++ii) {
295 for (LocalOrdinal kk = 0; kk < blockSize; ++kk) {
296 if (ii == kk) {
297 lclA(ii, kk) = one - one / (scalar_type)blockSize;
298 } else {
299 lclA(ii, kk) = -one / (scalar_type)blockSize;
300 }
301 }
302 }
303 // Copy lclInvA to temp
304 shared_matrix temp(thread.team_shmem(), blockSize, blockSize);
305 KokkosBatched::TeamCopy<member_type, KokkosBatched::Trans::NoTranspose>::invoke(thread, lclInvA, temp);
306 thread.team_barrier();
307
308 // lclInvA = proj * lclInvA
309 KokkosBatched::TeamGemm<member_type, KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, KokkosBatched::Algo::Gemm::Unblocked>::invoke(thread, one, lclA, temp, zero, lclInvA);
310 thread.team_barrier();
311 }
312
313 // write inverse of block to invA
314 for (LocalOrdinal ii = 0; ii < blockSize; ++ii) {
315 auto i = blockRow.colidx(ii);
316 auto row = invA.row(i);
317 for (LocalOrdinal jj = 0; jj < row.length; ++jj) {
318 auto j = row.colidx(jj);
319 for (LocalOrdinal kk = 0; kk < blockSize; ++kk)
320 if (blockRow.colidx(kk) == j) {
321 row.value(jj) = lclInvA(ii, kk);
322 break;
323 }
324 }
325 }
326 }
327
328 // amount of shared memory
329 size_t team_shmem_size(int /* team_size */) const {
330 return 3 * shared_matrix::shmem_size(maxBlocksize, maxBlocksize) + 2 * shared_vector::shmem_size(maxBlocksize);
331 }
332};
333
// The domain of the constraint operator: it acts on vectors living in the
// domain of the constraint matrix X.
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
const Teuchos::RCP<const Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::getDomainMap() const {
  return X_->getDomainMap();
}
338
// NOTE: deliberately X_'s *domain* map, not its range map. apply() computes
// the projection P -> P - X^T (X X^T)^{-1} X P, which maps the domain of X
// to itself, so this operator is square (range == domain).
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
const Teuchos::RCP<const Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::getRangeMap() const {
  return X_->getDomainMap();
}
343
344template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
346#ifdef HAVE_MUELU_BELOS
347 Monitor m(*this, "PrepareLeastSquaresSolveBelos");
348
349 TEUCHOS_TEST_FOR_EXCEPTION(!detect_singular_blocks, Exceptions::RuntimeError, "The option \"emin: least squares solver type\" = \"Belos\" is currently only implemented for non-singular constraint solves");
350
351 problem_ = rcp(new Belos::LinearProblem<Scalar, MV, OP>());
352
353 std::vector<RCP<Operator>> ops = {X_, X_};
354 std::vector<Teuchos::ETransp> modes = {Teuchos::NO_TRANS, Teuchos::TRANS};
355 RCP<Operator> XXt = rcp(new ProductOperator(ops, modes));
356 auto belosXXt = rcp(new Belos::XpetraOp<Scalar, LocalOrdinal, GlobalOrdinal, Node>(XXt));
357
358 problem_->setOperator(belosXXt);
359 problem_->setLabel("LeastSquares");
360
361 auto belosList = rcp(new Teuchos::ParameterList());
362 belosList->set("Implicit Residual Scaling", "None");
363 belosList->set("Convergence Tolerance", 1e-16);
364 auto out = GetMueLuOStream();
365 belosList->set("Output Stream", out->getOStream());
366 // belosList->set("Verbosity", Belos::Errors + Belos::Warnings + Belos::StatusTestDetails);
367 // belosList->set("Output Frequency", 1);
368 // belosList->set("Output Style", Belos::Brief);
369
370 Belos::SolverFactory<Scalar, MV, OP> solverFactory;
371 solver_ = solverFactory.create("Pseudo Block CG", belosList);
372#else
373 TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "Energy minimization multigrid requires Belos to be enabled.");
374#endif
375}
376
377template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
378Teuchos::RCP<Xpetra::Matrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
379allocateBlockDiagonalMatrix(RCP<const Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> map,
380 const typename Xpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_type blocks) {
381 using graph_type = typename Xpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_type;
382 using matrix_type = typename Xpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_type;
383 using execution_space = typename Node::execution_space;
384
385 auto numRows = map->getLocalNumElements();
386 auto numBlocks = blocks.numRows();
387 typename graph_type::row_map_type::non_const_type rowptr("rowptr", numRows + 1);
388
389 LocalOrdinal nnz = 0;
390 Kokkos::parallel_reduce(
391 "MueLu::Constraint::allocateBlockDiagonalMatrix::1", Kokkos::RangePolicy<execution_space>(0, numBlocks), KOKKOS_LAMBDA(const LocalOrdinal blockId, LocalOrdinal& count) {
392 auto blockRow = blocks.rowConst(blockId);
393 auto blockSize = blockRow.length;
394
395 for (LocalOrdinal k = 0; k < blockSize; ++k) {
396 auto rowId = blockRow.colidx(k);
397 if ((decltype(numRows))rowId+2<numRows+1)
398 Kokkos::atomic_add(&rowptr(rowId+2), blockSize);
399 count += blockSize;
400 } }, nnz);
401
402 Kokkos::parallel_scan(
403 "MueLu::Constraint::allocateBlockDiagonalMatrix::3", Kokkos::RangePolicy<execution_space>(0, numRows), KOKKOS_LAMBDA(const LocalOrdinal rowId, LocalOrdinal& sum, const bool is_final) {
404 sum += rowptr(rowId+1);
405 if (is_final) {
406 rowptr(rowId+1) = sum;
407 } });
408
409 typename graph_type::entries_type::non_const_type indices("lclInvXXt_indices", nnz);
410
411 Kokkos::parallel_for(
412 "MueLu::Constraint::allocateBlockDiagonalMatrix::3", Kokkos::RangePolicy<execution_space>(0, numBlocks), KOKKOS_LAMBDA(const LocalOrdinal blockId) {
413 auto blockRow = blocks.rowConst(blockId);
414 auto blockSize = blockRow.length;
415
416 for (LocalOrdinal k = 0; k < blockSize; ++k) {
417 auto rowId = blockRow.colidx(k);
418 for (LocalOrdinal jj = 0; jj < blockSize; ++jj) {
419 auto j = blockRow.colidx(jj);
420 auto l = Kokkos::atomic_fetch_inc(&rowptr(rowId + 1));
421 indices(l) = j;
422 }
423 }
424 });
425
426 auto lclInvXXtGraph = graph_type(indices, rowptr);
427 typename matrix_type::values_type::non_const_type values("lclInvXXt_values", nnz);
428 auto lclInvXXt = matrix_type("lclInvXXt", numRows, values, lclInvXXtGraph);
429 return Xpetra::MatrixFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(lclInvXXt, map, map, map, map);
430}
431
432template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
433typename Xpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_type Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::FindBlocks(RCP<const Xpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node>>& XXt) {
434 using execution_space = typename Node::execution_space;
435 using memory_space = typename Node::memory_space;
436 using range_type = Kokkos::RangePolicy<LocalOrdinal, execution_space>;
437
438 // This is a generic but less efficient implementation that discovers blocks by looping
439 // over the graph of the constraint matrix X. Using additional information, specific constraints
440 // can compute the block structure in simpler fashion.
441
442 auto numConstraints = XXt->getRowMap()->getLocalNumElements();
443
444 Kokkos::View<LocalOrdinal*, memory_space> blockIds("blockIds", numConstraints);
445 Kokkos::View<LocalOrdinal*, memory_space> blockIds2("blockIds2", numConstraints);
446
447 auto lclGraph = XXt->getLocalGraphDevice();
448
449 auto constraint_range = range_type(0, numConstraints);
450
451 // initialize blockIds with constraintIds
452 Kokkos::parallel_for(
453 "MueLu::Constraint::FindBlocks::init_blockIds", constraint_range,
454 KOKKOS_LAMBDA(const LocalOrdinal contraintId) {
455 blockIds(contraintId) = contraintId;
456 });
457 Kokkos::fence();
458
459 // loop over rows of XXt and assign min of encountered blockIds until nothing changes anymore.
460 bool changed = true;
461 bool resultsIn2 = false;
462 while (changed) {
463 changed = false;
464 if (!resultsIn2) {
465 MinSpmMV functor(lclGraph, blockIds, blockIds2);
466 Kokkos::parallel_reduce("MueLu::Constraint::FindBlocks::minSpmv1", constraint_range, functor, changed);
467 resultsIn2 = true;
468 } else {
469 MinSpmMV functor(lclGraph, blockIds2, blockIds);
470 Kokkos::parallel_reduce("MueLu::Constraint::FindBlocks::minSpmv2", constraint_range, functor, changed);
471 resultsIn2 = false;
472 }
473 }
474 if (resultsIn2)
475 Kokkos::deep_copy(blockIds, blockIds2);
476
477 // record all blockIds that are still in use
478 Kokkos::View<bool*, memory_space> blockStatus("blockStatus", numConstraints);
479 Kokkos::parallel_for(
480 "MueLu::Constraint::FindBlocks::set_blockstatus", constraint_range,
481 KOKKOS_LAMBDA(LocalOrdinal contraintId) {
482 Kokkos::atomic_store(&blockStatus(blockIds(contraintId)), true);
483 });
484 Kokkos::fence();
485
486 // renumber blockIds that are still in use consecutively
487 Kokkos::View<LocalOrdinal*, memory_space> newBlockIds("newBlockIds", numConstraints);
488 LocalOrdinal numBlocks = 0;
489 Kokkos::parallel_scan(
490 "MueLu::Constraint::FindBlocks::compute_blockIds", constraint_range,
491 KOKKOS_LAMBDA(LocalOrdinal contraintId, LocalOrdinal & blockId, const bool final) {
492 if (final)
493 newBlockIds(contraintId) = blockId;
494 if (blockStatus(contraintId))
495 ++blockId;
496 },
497 numBlocks);
498
499 // Build graph with block info
500 using graph_type = typename Xpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_type;
501 typename graph_type::row_map_type::non_const_type rowptr("blocks_rowptr", numBlocks + 1);
502 typename graph_type::entries_type::non_const_type indices("blocks_indices", numConstraints);
503
504 Kokkos::parallel_for(
505 "MueLu::Constraint::FindBlocks::count_entries_per_block", constraint_range, KOKKOS_LAMBDA(const LocalOrdinal constraintId) {
506 auto blockId = newBlockIds(blockIds(constraintId));
507 if (blockId + 2 < numBlocks + 1)
508 Kokkos::atomic_inc(&rowptr(blockId + 2));
509 });
510 Kokkos::fence();
511
512 auto block_range = range_type(0, numBlocks);
513
514 // prefix sum
515 Kokkos::parallel_scan(
516 "MueLu::Constraint::FindBlocks::prefix_sum", block_range, KOKKOS_LAMBDA(const LocalOrdinal blockId, LocalOrdinal& sum, const bool final) {
517 sum += rowptr(blockId+1);
518 if (final) {
519 rowptr(blockId+1) = sum;
520 } });
521
522 Kokkos::parallel_for(
523 "MueLu::Constraint::FindBlocks::fill", constraint_range, KOKKOS_LAMBDA(const LocalOrdinal contraintId) {
524 auto blockId = newBlockIds(blockIds(contraintId));
525 auto offset = Kokkos::atomic_fetch_inc(&rowptr(blockId + 1));
526 indices(offset) = contraintId;
527 });
528
529 graph_type blocks(indices, rowptr);
530
531 return blocks;
532}
533
534template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
536 using memory_space = typename Node::memory_space;
537 Monitor m(*this, "PrepareLeastSquaresSolveDirect");
538
539 RCP<Xpetra::Matrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>> XXt;
540 {
541 SubMonitor m2(*this, "XXt");
542 XXt = Xpetra::MatrixMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Multiply(*X_, false, *X_, true, XXt, GetOStream(Runtime0), true, true);
543 }
544
545 auto XXtgraph = XXt->getCrsGraph();
546 auto blocks = this->FindBlocks(XXtgraph);
547 invXXt_ = allocateBlockDiagonalMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(XXt->getRowMap(), blocks);
548
549 LocalOrdinal numBlocks = blocks.numRows();
550 LocalOrdinal maxBlocksize = invXXt_->getLocalMaxNumRowEntries();
551
552 // If we pass a view of size 0 to the functor, all blocks are assumed to be non-singular.
553 Kokkos::View<bool*, memory_space> block_is_singular;
554 LocalOrdinal numSingularBlocks = 0;
555 if (detect_singular_blocks)
556 block_is_singular = Kokkos::View<bool*, memory_space>("block_is_singular", numBlocks);
557
559 functor_type functor(XXt->getLocalMatrixDevice(), blocks, maxBlocksize, invXXt_->getLocalMatrixDevice(), block_is_singular);
560
561 if (detect_singular_blocks) {
562 SubMonitor m2(*this, "singular block detection");
563 Kokkos::parallel_for("MueLu::Constraint::findSingularBlocks", Kokkos::TeamPolicy<typename Node::execution_space, typename functor_type::TagFindSingularBlocks>(numBlocks, 1), functor);
564
565 if (IsPrint(Statistics0)) {
566 Kokkos::parallel_reduce("MueLu::Constraint::countSingularBlocks", Kokkos::TeamPolicy<typename Node::execution_space, typename functor_type::TagCountSingularBlocks>(numBlocks, 1), functor, numSingularBlocks);
567 }
568 }
569
570 {
571 SubMonitor m2(*this, "inversion");
572 Kokkos::parallel_for("MueLu::Constraint::invertBlocks", Kokkos::TeamPolicy<typename Node::execution_space, typename functor_type::TagApply>(numBlocks, 1), functor);
573 }
574
575 if (IsPrint(Statistics0)) {
576 // print some stats
577
578 auto comm = invXXt_->getRowMap()->getComm();
579 GlobalOrdinal globalNumBlocks;
580 GlobalOrdinal globalNumSingularBlocks;
581 MueLu_sumAll(comm, (GlobalOrdinal)numBlocks, globalNumBlocks);
582 MueLu_sumAll(comm, (GlobalOrdinal)numSingularBlocks, globalNumSingularBlocks);
583
584 GetOStream(Statistics0) << "Least-squares problem:\n maximum block size: " << invXXt_->getGlobalMaxNumRowEntries() << "\n Number of blocks: " << globalNumBlocks << "\n Number of singular blocks: " << globalNumSingularBlocks << std::endl;
585 }
586}
587
588template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
589void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::PrepareLeastSquaresSolve(const std::string& solverType, const bool detect_singular_blocks) {
590 if (solverType == "Belos")
591 PrepareLeastSquaresSolveBelos(detect_singular_blocks);
592 else if (solverType == "direct")
593 PrepareLeastSquaresSolveDirect(detect_singular_blocks);
594 else
595 TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "solverType must be one of (Belos|direct), not \"" << solverType << "\".");
596 solverType_ = solverType;
597}
598
599template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
601 // Solve (X * X^T) * C = B
602#ifdef HAVE_MUELU_BELOS
603 problem_->setLHS(rcpFromRef(C));
604 problem_->setRHS(rcpFromRef(B));
605 TEUCHOS_ASSERT(problem_->setProblem());
606
607 solver_->setProblem(problem_);
608 solver_->solve();
609#endif
610}
611
612template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
614 // Solve (X * X^T) * C = B
615 invXXt_->apply(B, C);
616}
617
618template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
619void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::LeastSquaresSolve(const MultiVector& B, MultiVector& C) const {
620 if (solverType_ == "Belos")
621 LeastSquaresSolveBelos(B, C);
622 else if (solverType_ == "direct")
623 LeastSquaresSolveDirect(B, C);
624 else
625 TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "solverType must be one of (Belos|direct), not \"" << solverType_ << "\".");
626}
627
628template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
630 MultiVector& Projected,
631 Teuchos::ETransp mode,
632 Scalar alpha,
633 Scalar beta) const {
634 const auto one = Teuchos::ScalarTraits<Scalar>::one();
635 const auto zero = Teuchos::ScalarTraits<Scalar>::zero();
636
637 TEUCHOS_ASSERT(mode == Teuchos::NO_TRANS);
638 TEUCHOS_ASSERT(alpha == one);
639 TEUCHOS_ASSERT(beta == zero);
640
641 // Projected = P - X^T * (X * X^T)^{-1} * X * P
642 Projected = P;
643 X_->apply(P, *temp1_, Teuchos::NO_TRANS);
644 LeastSquaresSolve(*temp1_, *temp2_);
645 X_->apply(*temp2_, *temp3_, Teuchos::TRANS);
646 Projected.update(-one, *temp3_, one);
647}
648
649template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
650void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::residual(const Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X,
651 const Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
652 Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& R) const {
653 const auto one = Teuchos::ScalarTraits<Scalar>::one();
654 const auto zero = Teuchos::ScalarTraits<Scalar>::zero();
655
656 apply(X, R, Teuchos::NO_TRANS, one, zero);
657 R.update(one, B, -one);
658}
659
// Accessor for the sparsity pattern (set via SetPattern) that the projected
// prolongator is constrained to.
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
RCP<const Xpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node>> Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::GetPattern() const {
  return Ppattern_;
}
664
665template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
667 Ppattern_ = Ppattern;
668}
669
670template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
672 X_ = X;
673
674 // Allocate memory
675 temp1_ = MultiVectorFactory::Build(X_->getRangeMap(), 1);
676 temp2_ = MultiVectorFactory::Build(X_->getRangeMap(), 1);
677 temp3_ = MultiVectorFactory::Build(X_->getDomainMap(), 1);
678}
679
// Accessor for the constraint matrix X set via SetConstraintsMatrix().
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
RCP<Xpetra::Matrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>> Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::GetConstraintMatrix() {
  return X_;
}
684
685template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
687 const RCP<const CrsGraph>& pattern,
688 MultiVector& vecP) const {
689 auto lclPattern = pattern->getLocalGraphDevice();
690 auto lclPatternRowMap = pattern->getRowMap()->getLocalMap();
691 auto lclPatternColMap = pattern->getColMap()->getLocalMap();
692
693 auto lclMat = P.getLocalMatrixDevice();
694 auto lclMatRowMap = P.getRowMap()->getLocalMap();
695 auto lclMatColMap = P.getColMap()->getLocalMap();
696
697 auto lclVec = vecP.getLocalViewDevice(Tpetra::Access::OverwriteAll);
698 TEUCHOS_ASSERT(lclPattern.numRows() == (typename decltype(lclPattern)::size_type)lclMat.numRows());
699 TEUCHOS_ASSERT(lclPattern.entries.extent(0) == lclVec.extent(0));
700 Kokkos::deep_copy(lclVec, 0.);
701
702 using range_type = Kokkos::RangePolicy<LocalOrdinal, typename Node::execution_space>;
703 Kokkos::parallel_for(
704 "MueLu::Constraint::AssignMatrixEntriesToVector::filter", range_type(0, lclPattern.numRows()), KOKKOS_LAMBDA(const size_t i) {
705 auto grid = lclPatternRowMap.getGlobalElement(i);
706 auto row_mat = lclMat.rowConst(lclMatRowMap.getLocalElement(grid));
707
708 if (row_mat.length == 0)
709 return;
710
711 for (size_t jj = lclPattern.row_map(i); jj < lclPattern.row_map(i + 1); ++jj) {
712 auto clid_pattern = lclPattern.entries(jj);
713 auto cgid = lclPatternColMap.getGlobalElement(clid_pattern);
714 auto clid_mat = lclMatColMap.getLocalElement(cgid);
715 // find column index in lclMat
716 LocalOrdinal kk = 0;
717 while ((kk + 1 < row_mat.length) && (row_mat.colidx(kk) != clid_mat))
718 ++kk;
719 if (row_mat.colidx(kk) == clid_mat) {
720 lclVec(jj, 0) = row_mat.value(kk);
721 }
722 }
723 });
724}
725
726template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
728 MultiVector& vecP) const {
729 AssignMatrixEntriesToVector(P, GetPattern(), vecP);
730}
731
732template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
733RCP<Xpetra::Matrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>> Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
734 GetMatrixWithEntriesFromVector(MultiVector& vecP) const {
735 auto Ppattern = GetPattern();
736 RCP<Matrix> P = MatrixFactory::Build(Ppattern);
737 {
738 auto lclMat = P->getLocalMatrixDevice();
739 auto lclVec = vecP.getLocalViewDevice(Tpetra::Access::ReadOnly);
740 TEUCHOS_ASSERT(lclMat.values.extent(0) == lclVec.extent(0));
741 Kokkos::deep_copy(lclMat.values, Kokkos::subview(lclVec, Kokkos::ALL(), 0));
742 }
743 P->fillComplete(Ppattern->getDomainMap(), Ppattern->getRowMap());
744 return P;
745}
746
747} // namespace MueLu
748
749#endif // ifndef MUELU_CONSTRAINT_DEF_HPP
#define MueLu_sumAll(rcpComm, in, out)
MueLu::DefaultLocalOrdinal LocalOrdinal
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
typename CrsGraph::local_graph_type local_graph_type
KokkosKernels::ArithTraits< magnitude_type > magATS
BlockInverseFunctor(local_matrix_type A_, local_graph_type blocks_, LocalOrdinal maxBlocksize_, local_matrix_type invA_, Kokkos::View< bool *, memory_space > singular_)
Kokkos::View< scalar_type **, typename Node::execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged > shared_matrix
Kokkos::View< scalar_type *, typename Node::execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged > shared_vector
typename Xpetra::CrsGraph< LocalOrdinal, GlobalOrdinal, Node > CrsGraph
typename local_matrix_type::value_type scalar_type
KOKKOS_INLINE_FUNCTION void operator()(TagApply, const typename Kokkos::TeamPolicy< typename Node::execution_space >::member_type &thread) const
typename ATS::magnitudeType magnitude_type
typename Xpetra::CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > CrsMatrix
typename Node::memory_space memory_space
KOKKOS_INLINE_FUNCTION void operator()(TagFindSingularBlocks, const typename Kokkos::TeamPolicy< typename Node::execution_space >::member_type &thread) const
KokkosKernels::ArithTraits< scalar_type > ATS
Kokkos::View< bool *, memory_space > singular
KOKKOS_INLINE_FUNCTION void operator()(TagCountSingularBlocks, const typename Kokkos::TeamPolicy< typename Node::execution_space >::member_type &thread, LocalOrdinal &numSingularBlocks) const
typename CrsMatrix::local_matrix_type local_matrix_type
void LeastSquaresSolveDirect(const MultiVector &B, MultiVector &C) const
Direct solve of least-squares problem.
RCP< Matrix > GetConstraintMatrix()
virtual const RCP< const Map > getDomainMap() const
The Map associated with the domain of this operator, which must be compatible with X....
void PrepareLeastSquaresSolveDirect(bool detect_singular_blocks)
Prepare direct solution of least-squares problem.
virtual void apply(const MultiVector &P, MultiVector &Projected, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Apply constraint.
void PrepareLeastSquaresSolveBelos(bool detect_singular_blocks)
Prepare least-squares solve using Belos.
virtual CrsGraph::local_graph_type FindBlocks(RCP< const CrsGraph > &XXt)
void PrepareLeastSquaresSolve(const std::string &solverType, bool detect_singular_blocks=false)
void residual(const MultiVector &X, const MultiVector &B, MultiVector &R) const
Compute a residual R = B - (*this) * X.
void LeastSquaresSolve(const MultiVector &B, MultiVector &C) const
void SetConstraintsMatrix(RCP< Matrix > &X)
void LeastSquaresSolveBelos(const MultiVector &B, MultiVector &C) const
Perform least-squares solve using Belos.
RCP< Matrix > GetMatrixWithEntriesFromVector(MultiVector &vecP) const
RCP< const CrsGraph > GetPattern() const
void SetPattern(RCP< const CrsGraph > &Ppattern)
virtual const RCP< const Map > getRangeMap() const
The Map associated with the range of this operator, which must be compatible with Y....
void AssignMatrixEntriesToVector(const Matrix &P, const RCP< const CrsGraph > &pattern, MultiVector &vecP) const
Exception throws to report errors in the internal logical of the program.
MinSpmMV(LocalGraph lclGraph_, LocalVector lhs_, LocalVector rhs_)
typename LocalVector::value_type local_ordinal_type
const local_ordinal_type MAX_VAL
KOKKOS_INLINE_FUNCTION void operator()(const local_ordinal_type i, bool &changed) const
KOKKOS_INLINE_FUNCTION void init(bool &dst)
KOKKOS_INLINE_FUNCTION void join(bool &dst, const bool &src)
typename LocalVector::value_type local_ordinal_type
const local_ordinal_type MAX_VAL
KOKKOS_INLINE_FUNCTION void operator()(const local_ordinal_type i) const
Timer to be used in non-factories.
Takes a sequence of operators and applies their product.
Timer to be used in non-factories. Similar to Monitor, but doesn't print object description.
Namespace for MueLu classes and methods.
@ Runtime0
One-liner description of what is happening.
@ Statistics0
Print statistics that do not involve significant additional computation.
Teuchos::RCP< Xpetra::Matrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > allocateBlockDiagonalMatrix(RCP< const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > > map, const typename Xpetra::CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::local_graph_type blocks)