MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// MueLu: A package for multigrid based preconditioning
4//
5// Copyright 2012 NTESS and the MueLu contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
11#define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
12
13#include "Kokkos_UnorderedMap.hpp"
14#include "Xpetra_CrsGraphFactory.hpp"
15
17
18#include "MueLu_Aggregates.hpp"
19#include "MueLu_AmalgamationInfo.hpp"
20#include "MueLu_AmalgamationFactory.hpp"
21
22#include "MueLu_MasterList.hpp"
23#include "MueLu_PerfUtils.hpp"
24#include "MueLu_Monitor.hpp"
25
26#include "Xpetra_IO.hpp"
27
28namespace MueLu {
29
30namespace { // anonymous
31
32template <class LocalOrdinal, class View>
33class ReduceMaxFunctor {
34 public:
35 ReduceMaxFunctor(View view)
36 : view_(view) {}
37
38 KOKKOS_INLINE_FUNCTION
39 void operator()(const LocalOrdinal& i, LocalOrdinal& vmax) const {
40 if (vmax < view_(i))
41 vmax = view_(i);
42 }
43
44 KOKKOS_INLINE_FUNCTION
45 void join(LocalOrdinal& dst, const LocalOrdinal& src) const {
46 if (dst < src) {
47 dst = src;
48 }
49 }
50
51 KOKKOS_INLINE_FUNCTION
52 void init(LocalOrdinal& dst) const {
53 dst = 0;
54 }
55
56 private:
57 View view_;
58};
59
// local QR decomposition
//
// Team-level functor run inside a parallel_reduce over aggregates. For each
// aggregate it extracts the corresponding rows of the fine nullspace, computes
// a Householder QR factorization of that local block (when doQRStep is set),
// writes the R factor into the coarse nullspace, and scatters the Q factor
// into the auxiliary CRS arrays (rowsAux/colsAux/valsAux) of the tentative
// prolongator. The reduction result is the number of nonzeros contributed.
template <class LOType, class GOType, class SCType, class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
class LocalQRDecompFunctor {
 private:
  typedef LOType LO;
  typedef GOType GO;
  typedef SCType SC;

  typedef typename DeviceType::execution_space execution_space;
  typedef typename KokkosKernels::ArithTraits<SC>::val_type impl_SC;
  typedef KokkosKernels::ArithTraits<impl_SC> impl_ATS;
  typedef typename impl_ATS::magnitudeType Magnitude;

 public:
  // Team scratch-memory matrix/vector types used for the per-aggregate QR.
  typedef Kokkos::View<impl_SC**, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
  typedef Kokkos::View<impl_SC*, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;

 private:
  NspType fineNS;                   // fine level nullspace (device view, random access)
  NspType coarseNS;                 // coarse level nullspace to be filled
  aggRowsType aggRows;              // CRS-style offsets: DOFs of aggregate a are positions aggRows(a)..aggRows(a+1)
  maxAggDofSizeType maxAggDofSize;  //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
  agg2RowMapLOType agg2RowMapLO;    // maps aggregate-local DOF position to local row id
  statusType statusAtomic;          // error flags (e.g. singular local block), written atomically
  rowsType rows;                    // per-row nonzero counts of tentative P (stored shifted by one row)
  rowsAuxType rowsAux;              // row offsets into the auxiliary cols/vals arrays
  colsAuxType colsAux;              // auxiliary column indices of tentative P
  valsAuxType valsAux;              // auxiliary values of tentative P
  // NOTE(review): the declaration of the boolean member 'doQRStep' (original
  // line 88) appears to be missing from this extraction; it is initialized in
  // the constructor and used in operator() below — confirm against upstream.

 public:
  // All views are captured by value (shallow Kokkos view copies).
  LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_)
    : fineNS(fineNS_)
    , coarseNS(coarseNS_)
    , aggRows(aggRows_)
    , maxAggDofSize(maxAggDofSize_)
    , agg2RowMapLO(agg2RowMapLO_)
    , statusAtomic(statusAtomic_)
    , rows(rows_)
    , rowsAux(rowsAux_)
    , colsAux(colsAux_)
    , valsAux(valsAux_)
    , doQRStep(doQRStep_) {}

  // One team per aggregate; nnz accumulates the nonzeros this team writes.
  KOKKOS_INLINE_FUNCTION
  void operator()(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread, size_t& nnz) const {
    auto agg = thread.league_rank();

    // size of aggregate: number of DOFs in aggregate
    auto aggSize = aggRows(agg + 1) - aggRows(agg);

    const impl_SC one = impl_ATS::one();
    const impl_SC two = one + one;
    const impl_SC zero = impl_ATS::zero();
    const auto zeroM = impl_ATS::magnitude(zero);

    int m = aggSize;            // rows of the local block (#DOFs in aggregate)
    int n = fineNS.extent(1);   // columns of the local block (nullspace dimension)

    // calculate row offset for coarse nullspace
    Xpetra::global_size_t offset = agg * n;

    if (doQRStep) {
      // Extract the piece of the nullspace corresponding to the aggregate
      shared_matrix r(thread.team_shmem(), m, n);  // A (initially), R (at the end)
      for (int j = 0; j < n; j++)
        for (int k = 0; k < m; k++)
          r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j);
#if 0
      printf("A\n");
      for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++)
          printf(" %5.3lf ", r(i,j));
        printf("\n");
      }
#endif

      // Calculate QR decomposition (standard)
      shared_matrix q(thread.team_shmem(), m, m);  // Q
      if (m >= n) {
        bool isSingular = false;

        // Initialize Q^T
        auto qt = q;
        for (int i = 0; i < m; i++) {
          for (int j = 0; j < m; j++)
            qt(i, j) = zero;
          qt(i, i) = one;
        }

        // Householder QR: for each column, build a reflector that zeros the
        // subdiagonal, then apply it to the trailing columns of R and to Q^T.
        for (int k = 0; k < n; k++) {  // we ignore "n" instead of "n-1" to normalize
          // FIXME_KOKKOS: use team
          Magnitude s = zeroM, norm, norm_x;
          for (int i = k + 1; i < m; i++)
            s += pow(impl_ATS::magnitude(r(i, k)), 2);
          norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);

          if (norm == zero) {
            // Column is identically zero -> local block is singular.
            isSingular = true;
            break;
          }

          r(k, k) -= norm * one;

          norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
          if (norm_x == zeroM) {
            // We have a single diagonal element in the column.
            // No reflections required. Just need to restore r(k,k).
            r(k, k) = norm * one;
            continue;
          }

          // FIXME_KOKKOS: use team
          // Normalize the Householder vector (stored in column k of r).
          for (int i = k; i < m; i++)
            r(i, k) /= norm_x;

          // Update R(k:m,k+1:n)
          for (int j = k + 1; j < n; j++) {
            // FIXME_KOKKOS: use team in the loops
            impl_SC si = zero;
            for (int i = k; i < m; i++)
              si += r(i, k) * r(i, j);
            for (int i = k; i < m; i++)
              r(i, j) -= two * si * r(i, k);
          }

          // Update Q^T (k:m,k:m)
          for (int j = k; j < m; j++) {
            // FIXME_KOKKOS: use team in the loops
            impl_SC si = zero;
            for (int i = k; i < m; i++)
              si += r(i, k) * qt(i, j);
            for (int i = k; i < m; i++)
              qt(i, j) -= two * si * r(i, k);
          }

          // Fix R(k:m,k): the reflector zeroed the subdiagonal, record norm.
          r(k, k) = norm * one;
          for (int i = k + 1; i < m; i++)
            r(i, k) = zero;
        }

#if 0
        // Q = (Q^T)^T
        for (int i = 0; i < m; i++)
          for (int j = 0; j < i; j++) {
            impl_SC tmp = qt(i,j);
            qt(i,j) = qt(j,i);
            qt(j,i) = tmp;
          }
#endif

        // Build coarse nullspace using the upper triangular part of R
        for (int j = 0; j < n; j++)
          for (int k = 0; k <= j; k++)
            coarseNS(offset + k, j) = r(k, j);

        if (isSingular) {
          // Flag the zero column and bail; host code raises the error.
          statusAtomic(1) = true;
          return;
        }

      } else {
        // Special handling for m < n (i.e. single node aggregates in structural mechanics)

        // The local QR decomposition is not possible in the "overconstrained"
        // case (i.e. number of columns in qr > number of rowsAux), which
        // corresponds to #DOFs in Aggregate < n. For usual problems this
        // is only possible for single node aggregates in structural mechanics.
        // (Similar problems may arise in discontinuous Galerkin problems...)
        // We bypass the QR decomposition and use an identity block in the
        // tentative prolongator for the single node aggregate and transfer the
        // corresponding fine level null space information 1-to-1 to the coarse
        // level null space part.

        // NOTE: The resulting tentative prolongation operator has
        // (m*DofsPerNode-n) zero columns leading to a singular
        // coarse level operator A. To deal with that one has the following
        // options:
        // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
        //   false) to add some identity block to the diagonal of the zero rowsAux
        //   in the coarse level operator A, such that standard level smoothers
        //   can be used again.
        // - Use special (projection-based) level smoothers, which can deal
        //   with singular matrices (very application specific)
        // - Adapt the code below to avoid zero columns. However, we do not
        //   support a variable number of DOFs per node in MueLu/Xpetra which
        //   makes the implementation really hard.
        //
        // FIXME: do we need to check for singularity here somehow? Zero
        // columns would be easy but linear dependency would require proper QR.

        // R = extended (by adding identity rowsAux) qr
        for (int j = 0; j < n; j++)
          for (int k = 0; k < n; k++)
            if (k < m)
              coarseNS(offset + k, j) = r(k, j);
            else
              coarseNS(offset + k, j) = (k == j ? one : zero);

        // Q = I (rectangular)
        for (int i = 0; i < m; i++)
          for (int j = 0; j < n; j++)
            q(i, j) = (j == i ? one : zero);
      }

      // Process each row in the local Q factor and fill helper arrays to assemble P
      for (int j = 0; j < m; j++) {
        LO localRow = agg2RowMapLO(aggRows(agg) + j);
        size_t rowStart = rowsAux(localRow);
        size_t lnnz = 0;
        for (int k = 0; k < n; k++) {
          // skip zeros
          if (q(j, k) != zero) {
            colsAux(rowStart + lnnz) = offset + k;
            valsAux(rowStart + lnnz) = q(j, k);
            lnnz++;
          }
        }
        // Shifted by one so a later scan can turn counts into offsets.
        rows(localRow + 1) = lnnz;
        nnz += lnnz;
      }

#if 0
      printf("R\n");
      for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++)
          printf(" %5.3lf ", coarseNS(i,j));
        printf("\n");
      }

      printf("Q\n");
      for (int i = 0; i < aggSize; i++) {
        for (int j = 0; j < aggSize; j++)
          printf(" %5.3lf ", q(i,j));
        printf("\n");
      }
#endif
    } else {
      // "no-QR" option //
      // Local Q factor is just the fine nullspace support over the current aggregate.
      // Local R factor is the identity.
      // TODO I have not implemented any special handling for aggregates that are too
      // TODO small to locally support the nullspace, as is done in the standard QR
      // TODO case above.

      for (int j = 0; j < m; j++) {
        LO localRow = agg2RowMapLO(aggRows(agg) + j);
        size_t rowStart = rowsAux(localRow);
        size_t lnnz = 0;
        for (int k = 0; k < n; k++) {
          const impl_SC qr_jk = fineNS(localRow, k);
          // skip zeros
          if (qr_jk != zero) {
            colsAux(rowStart + lnnz) = offset + k;
            valsAux(rowStart + lnnz) = qr_jk;
            lnnz++;
          }
        }
        // Shifted by one so a later scan can turn counts into offsets.
        rows(localRow + 1) = lnnz;
        nnz += lnnz;
      }

      // R = I: coarse nullspace is the identity block for this aggregate.
      for (int j = 0; j < n; j++)
        coarseNS(offset + j, j) = one;
    }
  }
};
330
331} // namespace
332
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
// NOTE(review): the function signature line (original line 334, presumably
// "RCP<const ParameterList> TentativePFactory_kokkos<...>::GetValidParameterList() const {")
// appears to have been dropped by the extraction; the body below opens
// directly. Confirm against the upstream file.
// Builds the list of parameters (with defaults from the MasterList) and
// factory dependencies that this factory accepts.
  RCP<ParameterList> validParamList = rcp(new ParameterList());

// Pull the documented default for each option out of the MasterList.
#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
  SET_VALID_ENTRY("tentative: calculate qr");
  SET_VALID_ENTRY("tentative: build coarse coordinates");
  SET_VALID_ENTRY("sa: keep tentative prolongator");
#undef SET_VALID_ENTRY

  validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
  validParamList->set<RCP<const FactoryBase>>("Aggregates", Teuchos::null, "Generating factory of the aggregates");
  validParamList->set<RCP<const FactoryBase>>("Nullspace", Teuchos::null, "Generating factory of the nullspace");
  validParamList->set<RCP<const FactoryBase>>("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
  validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
  validParamList->set<RCP<const FactoryBase>>("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
  validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory of the coordinates");
  validParamList->set<RCP<const FactoryBase>>("Node Comm", Teuchos::null, "Generating factory of the node level communicator");

  // Make sure we don't recursively validate options for the matrixmatrix kernels
  ParameterList norecurse;
  norecurse.disableRecursiveValidation();
  validParamList->set<ParameterList>("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");

  return validParamList;
}
359
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
// NOTE(review): the function signature line (original line 361, presumably
// "void TentativePFactory_kokkos<...>::DeclareInput(Level& fineLevel, Level& coarseLevel) const {")
// appears to have been dropped by the extraction; confirm against upstream.
// Requests the fine-level data this factory consumes in Build().
  const ParameterList& pL = GetParameterList();
  // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
  std::string nspName = "Nullspace";
  if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");

  Input(fineLevel, "A");
  Input(fineLevel, "Aggregates");
  Input(fineLevel, nspName);
  Input(fineLevel, "UnAmalgamationInfo");
  Input(fineLevel, "CoarseMap");
  if (fineLevel.GetLevelID() == 0 &&
      fineLevel.IsAvailable("Coordinates", NoFactory::get()) &&  // we have coordinates (provided by user app)
      pL.get<bool>("tentative: build coarse coordinates")) {     // and we want coordinates on other levels
    bTransferCoordinates_ = true;                                // then set the transfer coordinates flag to true
    Input(fineLevel, "Coordinates");
  } else if (bTransferCoordinates_) {
    // Transfer was switched on while processing a finer level; keep
    // requesting coordinates so they continue to propagate downward.
    Input(fineLevel, "Coordinates");
  }
}
381
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
// NOTE(review): the function signature line (original line 383, presumably
// "void TentativePFactory_kokkos<...>::Build(Level& fineLevel, Level& coarseLevel) const {")
// appears to have been dropped by the extraction; confirm against upstream.
// Build is a thin wrapper: all of the work happens in BuildP.
  return BuildP(fineLevel, coarseLevel);
}
386
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
// NOTE(review): the function signature line (original line 388, presumably
// "void TentativePFactory_kokkos<...>::BuildP(Level& fineLevel, Level& coarseLevel) const {")
// appears to have been dropped by the extraction; confirm against upstream.
// Top-level driver: gathers inputs, optionally averages fine coordinates onto
// the coarse level, dispatches to the coupled/uncoupled prolongator builders,
// and publishes "P", "Nullspace", and (optionally) "Coordinates"/"Ptent".
  FactoryMonitor m(*this, "Build", coarseLevel);

  typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
  typedef Xpetra::MultiVectorFactory<coordinate_type, LO, GO, NO> RealValuedMultiVectorFactory;
  const ParameterList& pL = GetParameterList();
  std::string nspName = "Nullspace";
  if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");

  auto A = Get<RCP<Matrix>>(fineLevel, "A");
  auto aggregates = Get<RCP<Aggregates>>(fineLevel, "Aggregates");
  auto amalgInfo = Get<RCP<AmalgamationInfo>>(fineLevel, "UnAmalgamationInfo");
  auto fineNullspace = Get<RCP<MultiVector>>(fineLevel, nspName);
  auto coarseMap = Get<RCP<const Map>>(fineLevel, "CoarseMap");
  RCP<RealValuedMultiVector> fineCoords;
  if (bTransferCoordinates_) {
    fineCoords = Get<RCP<RealValuedMultiVector>>(fineLevel, "Coordinates");
  }

  RCP<Matrix> Ptentative;
  // No coarse DoFs so we need to bail by setting Ptentative to null and returning
  // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
  if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
    Ptentative = Teuchos::null;
    Set(coarseLevel, "P", Ptentative);
    return;
  }
  RCP<MultiVector> coarseNullspace;
  RCP<RealValuedMultiVector> coarseCoords;

  if (bTransferCoordinates_) {
    // Build coarse coordinates by averaging the fine coordinates of each
    // aggregate's nodes.
    RCP<const Map> coarseCoordMap;

    LO blkSize = 1;
    if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
      blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();

    if (blkSize == 1) {
      // Scalar system
      // No amalgamation required, we can use the coarseMap
      coarseCoordMap = coarseMap;
    } else {
      // Vector system
      // Amalgamate the DOF-based coarse map down to a node-based map.
      AmalgamationFactory<SC, LO, GO, NO>::AmalgamateMap(rcp_dynamic_cast<const StridedMap>(coarseMap), coarseCoordMap);
    }

    coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors(), false);

    // Create overlapped fine coordinates to reduce global communication
    auto uniqueMap = fineCoords->getMap();
    RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
    if (aggregates->AggregatesCrossProcessors()) {
      auto nonUniqueMap = aggregates->GetMap();
      auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);

      ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors(), false);
      ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
    }

    // The good news is that this graph has already been constructed for the
    // TentativePFactory and was cached in Aggregates. So this is a no-op.
    auto aggGraph = aggregates->GetGraph();
    auto numAggs = aggGraph.numRows();

    auto fineCoordsView = fineCoords->getLocalViewDevice(Tpetra::Access::ReadOnly);
    auto coarseCoordsView = coarseCoords->getLocalViewDevice(Tpetra::Access::OverwriteAll);

    // Fill in coarse coordinates
    {
      SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);

      const auto dim = fineCoords->getNumVectors();

      typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
      for (size_t j = 0; j < dim; j++) {
        Kokkos::parallel_for(
            "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
            KOKKOS_LAMBDA(const LO i) {
              // A row in this graph represents all node ids in the aggregate
              // Therefore, averaging is very easy

              auto aggregate = aggGraph.rowConst(i);

              coordinate_type sum = 0.0;  // do not use Scalar here (Stokhos)
              for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
                sum += fineCoordsRandomView(aggregate(colID), j);

              coarseCoordsView(i, j) = sum / aggregate.length;
            });
      }
    }
  }

  if (!aggregates->AggregatesCrossProcessors()) {
    // All aggregates are process-local: use the uncoupled (communication-free)
    // builders, with a special path for Tpetra BlockCrs matrices.
    if (Xpetra::Helpers<SC, LO, GO, NO>::isTpetraBlockCrs(A)) {
      BuildPuncoupledBlockCrs(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
                              coarseLevel.GetLevelID());
    } else {
      BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
    }
  } else
    BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);

  // If available, use striding information of fine level matrix A for range
  // map and coarseMap as domain map; otherwise use plain range map of
  // Ptent = plain range map of A for range map and coarseMap as domain map.
  // NOTE:
  // The latter is not really safe, since there is no striding information
  // for the range map. This is not really a problem, since striding
  // information is always available on the intermediate levels and the
  // coarsest levels.
  if (A->IsView("stridedMaps") == true)
    Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);

  if (bTransferCoordinates_) {
    Set(coarseLevel, "Coordinates", coarseCoords);
  }

  // FIXME: We should remove the NodeComm on levels past the threshold
  if (fineLevel.IsAvailable("Node Comm")) {
    RCP<const Teuchos::Comm<int>> nodeComm = Get<RCP<const Teuchos::Comm<int>>>(fineLevel, "Node Comm");
    Set<RCP<const Teuchos::Comm<int>>>(coarseLevel, "Node Comm", nodeComm);
  }

  Set(coarseLevel, "Nullspace", coarseNullspace);
  Set(coarseLevel, "P", Ptentative);

  if (pL.get<bool>("sa: keep tentative prolongator")) {
    // Keep Ptent on the level (with NoFactory) so e.g. SaPFactory can reuse it.
    coarseLevel.Set("Ptent", Ptentative, NoFactory::get());
    coarseLevel.AddKeepFlag("Ptent", NoFactory::get(), MueLu::Final);
  }

  if (IsPrint(Statistics2)) {
    RCP<ParameterList> params = rcp(new ParameterList());
    params->set("printLoadBalancingInfo", true);
    GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
  }
}
526
527template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
529 BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
530 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
531 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
532 RCP<MultiVector>& coarseNullspace, const int levelID) const {
533 auto rowMap = A->getRowMap();
534 auto colMap = A->getColMap();
535
536 const size_t numRows = rowMap->getLocalNumElements();
537 const size_t NSDim = fineNullspace->getNumVectors();
538
539 typedef KokkosKernels::ArithTraits<SC> ATS;
540 using impl_SC = typename ATS::val_type;
541 using impl_ATS = KokkosKernels::ArithTraits<impl_SC>;
542 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
543
544 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
545
546 typename Aggregates::local_graph_type aggGraph;
547 {
548 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
549 aggGraph = aggregates->GetGraph();
550 }
551 auto aggRows = aggGraph.row_map;
552 auto aggCols = aggGraph.entries;
553
554 // Aggregates map is based on the amalgamated column map
555 // We can skip global-to-local conversion if LIDs in row map are
556 // same as LIDs in column map
557 bool goodMap;
558 {
559 SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
560 goodMap = isGoodMap(*rowMap, *colMap);
561 }
562 // FIXME_KOKKOS: need to proofread later code for bad maps
563 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
564 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
565 "(i.e. \"matching\" row and column maps)");
566
567 // STEP 1: do unamalgamation
568 // The non-kokkos version uses member functions from the AmalgamationInfo
569 // container class to unamalgamate the data. In contrast, the kokkos
570 // version of TentativePFactory does the unamalgamation here and only uses
571 // the data of the AmalgamationInfo container class
572
573 // Extract information for unamalgamation
574 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
575 GO indexBase;
576 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
577 GO globalOffset = amalgInfo->GlobalOffset();
578
579 // Extract aggregation info (already in Kokkos host views)
580 auto procWinner = aggregates->GetProcWinner()->getLocalViewDevice(Tpetra::Access::ReadOnly);
581 auto vertex2AggId = aggregates->GetVertex2AggId()->getLocalViewDevice(Tpetra::Access::ReadOnly);
582 const size_t numAggregates = aggregates->GetNumAggregates();
583
584 int myPID = aggregates->GetMap()->getComm()->getRank();
585
586 // Create Kokkos::View (on the device) to store the aggreate dof sizes
587 // Later used to get aggregate dof offsets
588 // NOTE: This zeros itself on construction
589 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
590 AggSizeType aggDofSizes;
591
592 if (stridedBlockSize == 1) {
593 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
594
595 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
596 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
597
598 auto sizesConst = aggregates->ComputeAggregateSizes();
599 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), sizesConst);
600
601 } else {
602 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
603
604 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
605 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
606
607 auto nodeMap = aggregates->GetMap()->getLocalMap();
608 auto dofMap = colMap->getLocalMap();
609
610 Kokkos::parallel_for(
611 "MueLu:TentativePF:Build:compute_agg_sizes", range_type(0, numAggregates),
612 KOKKOS_LAMBDA(const LO agg) {
613 auto aggRowView = aggGraph.rowConst(agg);
614
615 size_t size = 0;
616 for (LO colID = 0; colID < aggRowView.length; colID++) {
617 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
618
619 for (LO k = 0; k < stridedBlockSize; k++) {
620 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
621
622 if (dofMap.getLocalElement(dofGID) != INVALID)
623 size++;
624 }
625 }
626 aggDofSizes(agg + 1) = size;
627 });
628 }
629
630 // Find maximum dof size for aggregates
631 // Later used to reserve enough scratch space for local QR decompositions
632 LO maxAggSize = 0;
633 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
634 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
635
636 // parallel_scan (exclusive)
637 // The aggDofSizes View then contains the aggregate dof offsets
638 Kokkos::parallel_scan(
639 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
640 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
641 update += aggDofSizes(i);
642 if (final_pass)
643 aggDofSizes(i) = update;
644 });
645
646 // Create Kokkos::View on the device to store mapping
647 // between (local) aggregate id and row map ids (LIDs)
648 Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
649 {
650 SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
651
652 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
653 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
654
655 Kokkos::parallel_for(
656 "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
657 KOKKOS_LAMBDA(const LO lnode) {
658 if (procWinner(lnode, 0) == myPID) {
659 // No need for atomics, it's one-to-one
660 auto aggID = vertex2AggId(lnode, 0);
661
662 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
663 // FIXME: I think this may be wrong
664 // We unconditionally add the whole block here. When we calculated
665 // aggDofSizes, we did the isLocalElement check. Something's fishy.
666 for (LO k = 0; k < stridedBlockSize; k++)
667 agg2RowMapLO(offset + k) = lnode * stridedBlockSize + k;
668 }
669 });
670 }
671
672 // STEP 2: prepare local QR decomposition
673 // Reserve memory for tentative prolongation operator
674 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim, true);
675
676 // Pull out the nullspace vectors so that we can have random access (on the device)
677 auto fineNS = fineNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
678 auto coarseNS = coarseNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
679
680 size_t nnz = 0; // actual number of nnz
681
682 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_device_type local_matrix_type;
683 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
684 typedef typename local_matrix_type::index_type::non_const_type cols_type;
685 typedef typename local_matrix_type::values_type::non_const_type vals_type;
686
687 // Device View for status (error messages...)
688 typedef Kokkos::View<int[10], DeviceType> status_type;
689 status_type status("status");
690
691 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
693
694 const ParameterList& pL = GetParameterList();
695 const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
696 if (!doQRStep) {
697 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
698 if (NSDim > 1)
699 GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
700 }
701
702 size_t nnzEstimate = numRows * NSDim;
703 rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows + 1);
704 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
705 vals_type valsAux("Ptent_aux_vals", nnzEstimate);
706 rows_type rows("Ptent_rows", numRows + 1);
707 {
708 // Stage 0: fill in views.
709 SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
710
711 // The main thing to notice is initialization of vals with INVALID. These
712 // values will later be used to compress the arrays
713 Kokkos::parallel_for(
714 "MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows + 1),
715 KOKKOS_LAMBDA(const LO row) {
716 rowsAux(row) = row * NSDim;
717 });
718 Kokkos::parallel_for(
719 "MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
720 KOKKOS_LAMBDA(const LO j) {
721 colsAux(j) = INVALID;
722 });
723 }
724
725 if (NSDim == 1) {
726 // 1D is special, as it is the easiest. We don't even need to the QR,
727 // just normalize an array. Plus, no worries abot small aggregates. In
728 // addition, we do not worry about compression. It is unlikely that
729 // nullspace will have zeros. If it does, a prolongator row would be
730 // zero and we'll get singularity anyway.
731 SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
732
733 // Set up team policy with numAggregates teams and one thread per team.
734 // Each team handles a slice of the data associated with one aggregate
735 // and performs a local QR decomposition (in this case real QR is
736 // unnecessary).
737 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
738
739 if (doQRStep) {
740 Kokkos::parallel_for(
741 "MueLu:TentativePF:BuildUncoupled:main_loop", policy,
742 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
743 auto agg = thread.league_rank();
744
745 // size of the aggregate (number of DOFs in aggregate)
746 LO aggSize = aggRows(agg + 1) - aggRows(agg);
747
748 // Extract the piece of the nullspace corresponding to the aggregate, and
749 // put it in the flat array, "localQR" (in column major format) for the
750 // QR routine. Trivial in 1D.
751 auto norm = impl_ATS::magnitude(zero);
752
753 // Calculate QR by hand
754 // FIXME: shouldn't there be stridedblock here?
755 // FIXME_KOKKOS: shouldn't there be stridedblock here?
756 for (decltype(aggSize) k = 0; k < aggSize; k++) {
757 auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0));
758 norm += dnorm * dnorm;
759 }
760 norm = sqrt(norm);
761
762 if (norm == zero) {
763 // zero column; terminate the execution
764 statusAtomic(1) = true;
765 return;
766 }
767
768 // R = norm
769 coarseNS(agg, 0) = norm;
770
771 // Q = localQR(:,0)/norm
772 for (decltype(aggSize) k = 0; k < aggSize; k++) {
773 LO localRow = agg2RowMapLO(aggRows(agg) + k);
774 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0) / norm;
775
776 rows(localRow + 1) = 1;
777 colsAux(localRow) = agg;
778 valsAux(localRow) = localVal;
779 }
780 });
781
782 typename status_type::host_mirror_type statusHost = Kokkos::create_mirror_view(status);
783 Kokkos::deep_copy(statusHost, status);
784 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
785 if (statusHost(i)) {
786 std::ostringstream oss;
787 oss << "MueLu::TentativePFactory::MakeTentative: ";
788 switch (i) {
789 case 0: oss << "!goodMap is not implemented"; break;
790 case 1: oss << "fine level NS part has a zero column"; break;
791 }
792 throw Exceptions::RuntimeError(oss.str());
793 }
794
795 } else {
796 Kokkos::parallel_for(
797 "MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
798 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
799 auto agg = thread.league_rank();
800
801 // size of the aggregate (number of DOFs in aggregate)
802 LO aggSize = aggRows(agg + 1) - aggRows(agg);
803
804 // R = norm
805 coarseNS(agg, 0) = one;
806
807 // Q = localQR(:,0)/norm
808 for (decltype(aggSize) k = 0; k < aggSize; k++) {
809 LO localRow = agg2RowMapLO(aggRows(agg) + k);
810 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0);
811
812 rows(localRow + 1) = 1;
813 colsAux(localRow) = agg;
814 valsAux(localRow) = localVal;
815 }
816 });
817 }
818
819 Kokkos::parallel_reduce(
820 "MueLu:TentativeP:CountNNZ", range_type(0, numRows + 1),
821 KOKKOS_LAMBDA(const LO i, size_t& nnz_count) {
822 nnz_count += rows(i);
823 },
824 nnz);
825
826 } else { // NSdim > 1
827 // FIXME_KOKKOS: This code branch is completely unoptimized.
828 // Work to do:
829 // - Optimize QR decomposition
830 // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
831 // packing new values in the beginning of each row
832 // We do use auxilary view in this case, so keep a second rows view for
833 // counting nonzeros in rows
834
835 {
836 SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
837 // Set up team policy with numAggregates teams and one thread per team.
838 // Each team handles a slice of the data associated with one aggregate
839 // and performs a local QR decomposition
840 Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1); // numAggregates teams a 1 thread
841 LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
842 decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
843 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
844 decltype(valsAux)>
845 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
847 if (doQRStep) {
848 using shared_matrix = typename decltype(localQRFunctor)::shared_matrix;
849 int m = maxAggSize;
850 int n = fineNSRandom.extent(1);
851 int size = shared_matrix::shmem_size(m, n) + // r
852 shared_matrix::shmem_size(m, m); // q
853
854 if (size < policy.scratch_size_max(/*level=*/(int)0))
855 policy.set_scratch_size(/*level=*/(int)0, Kokkos::PerTeam(size));
856 else if (size < policy.scratch_size_max(/*level=*/(int)1))
857 policy.set_scratch_size(/*level=*/(int)1, Kokkos::PerTeam(size));
858 else
859 throw Exceptions::RuntimeError("Neither L0 scratch memory (max size " + std::to_string(policy.scratch_size_max((int)0)) +
860 "), nor L1 scratch memory (max size " + std::to_string(policy.scratch_size_max((int)1)) +
861 ") is large enough for requested allocation of size " + std::to_string(size));
862 }
863 Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
864 }
865
866 typename status_type::host_mirror_type statusHost = Kokkos::create_mirror_view(status);
867 Kokkos::deep_copy(statusHost, status);
868 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
869 if (statusHost(i)) {
870 std::ostringstream oss;
871 oss << "MueLu::TentativePFactory::MakeTentative: ";
872 switch (i) {
873 case 0: oss << "!goodMap is not implemented"; break;
874 case 1: oss << "fine level NS part has a zero column"; break;
875 }
876 throw Exceptions::RuntimeError(oss.str());
877 }
878 }
879
880 // Compress the cols and vals by ignoring INVALID column entries that correspond
881 // to 0 in QR.
882
883 // The real cols and vals are constructed using calculated (not estimated) nnz
884 cols_type cols;
885 vals_type vals;
886
887 if (nnz != nnzEstimate) {
888 {
889 // Stage 2: compress the arrays
890 SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
891
892 Kokkos::parallel_scan(
893 "MueLu:TentativePF:Build:compress_rows", range_type(0, numRows + 1),
894 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
895 upd += rows(i);
896 if (final)
897 rows(i) = upd;
898 });
899 }
900
901 {
902 SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
903
904 cols = cols_type("Ptent_cols", nnz);
905 vals = vals_type("Ptent_vals", nnz);
906
907 // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
908 // to the beginning of rows. See CoalesceDropFactory_kokkos for
909 // example.
910 Kokkos::parallel_for(
911 "MueLu:TentativePF:Build:compress_cols_vals", range_type(0, numRows),
912 KOKKOS_LAMBDA(const LO i) {
913 LO rowStart = rows(i);
914
915 size_t lnnz = 0;
916 for (auto j = rowsAux(i); j < rowsAux(i + 1); j++)
917 if (colsAux(j) != INVALID) {
918 cols(rowStart + lnnz) = colsAux(j);
919 vals(rowStart + lnnz) = valsAux(j);
920 lnnz++;
921 }
922 });
923 }
924
925 } else {
926 rows = rowsAux;
927 cols = colsAux;
928 vals = valsAux;
929 }
930
931 GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
932
933 {
934 // Stage 3: construct Xpetra::Matrix
935 SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
936
937 local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
938
939 // Managing labels & constants for ESFC
940 RCP<ParameterList> FCparams;
941 if (pL.isSublist("matrixmatrix: kernel params"))
942 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
943 else
944 FCparams = rcp(new ParameterList);
945
946 // By default, we don't need global constants for TentativeP
947 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
948 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
949
950 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
951 Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
952 }
953}
954
955 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
// NOTE(review): the doxygen extraction this listing came from dropped original line 956
// here (the "void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::"
// return-type/qualifier line of this definition) -- confirm against the repository source.
//
// Build an uncoupled tentative prolongator for a BlockCrs matrix A, bypassing the
// local QR step entirely (see the banner printed below).
//   coarseLevel     - level object, used here only for SubFactoryMonitor timers
//   A               - fine-level (BlockCrs) operator; supplies row/range/column maps
//   aggregates      - aggregation info (graph, sizes, procWinner, vertex2AggId)
//   amalgInfo       - supplies striding information for unamalgamation
//   fineNullspace   - fine-level nullspace vectors; its vector count defines NSDim
//   coarsePointMap  - point (unblocked) map of the coarse level
//   Ptentative      - [out] the tentative prolongator, wrapped BlockCrs matrix
//   coarseNullspace - [out] coarse nullspace, allocated here on coarsePointMap
//   levelID         - level index, used only for timer labels
// Throws Exceptions::RuntimeError when row/column maps are not nested ("good"),
// and std::runtime_error if the matrix factory does not return a BlockCrs matrix.
957 BuildPuncoupledBlockCrs(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
958 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
959 RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
960 RCP<MultiVector>& coarseNullspace, const int levelID) const {
961 /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
962 be generalized later, if we ever need to do so:
963 1) Null space dimension === block size of matrix: So no elasticity right now
964 2) QR is not supported: Under assumption #1, this shouldn't cause problems.
965 3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
966
967 These assumptions keep our code way simpler and still support the use cases we actually care about.
968 */
969
970 RCP<const Map> rowMap = A->getRowMap();
971 RCP<const Map> rangeMap = A->getRangeMap();
972 RCP<const Map> colMap = A->getColMap();
973 // const size_t numFinePointRows = rangeMap->getLocalNumElements();
974 const size_t numFineBlockRows = rowMap->getLocalNumElements();
975
976 // typedef Teuchos::ScalarTraits<SC> STS;
977 // typedef typename STS::magnitudeType Magnitude;
978 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
979
980 typedef KokkosKernels::ArithTraits<SC> ATS;
981 using impl_SC = typename ATS::val_type;
982 using impl_ATS = KokkosKernels::ArithTraits<impl_SC>;
983 const impl_SC one = impl_ATS::one();
984
985 // const GO numAggs = aggregates->GetNumAggregates();
986 const size_t NSDim = fineNullspace->getNumVectors();
987 auto aggSizes = aggregates->ComputeAggregateSizes();
988
989 typename Aggregates::local_graph_type aggGraph;
990 {
991 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
992 aggGraph = aggregates->GetGraph();
993 }
994 auto aggRows = aggGraph.row_map;
995 auto aggCols = aggGraph.entries;
996
997 // Need to generate the coarse block map
998 // NOTE: We assume NSDim == block size here
999 // NOTE: We also assume that coarseMap has contiguous GIDs
1000 // const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
1001 const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1002 RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1003 Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
1004 numCoarseBlockRows,
1005 coarsePointMap->getIndexBase(),
1006 coarsePointMap->getComm());
1007 // Sanity checking
1008 const ParameterList& pL = GetParameterList();
1009 // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1010
1011 // The aggregates use the amalgamated column map, which in this case is what we want
1012
1013 // Aggregates map is based on the amalgamated column map
1014 // We can skip global-to-local conversion if LIDs in row map are
1015 // same as LIDs in column map
1016 bool goodMap = MueLu::Utilities<SC, LO, GO, NO>::MapsAreNested(*rowMap, *colMap);
1017 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
1018 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1019 "(i.e. \"matching\" row and column maps)");
1020
1021 // STEP 1: do unamalgamation
1022 // The non-kokkos version uses member functions from the AmalgamationInfo
1023 // container class to unamalgamate the data. In contrast, the kokkos
1024 // version of TentativePFactory does the unamalgamation here and only uses
1025 // the data of the AmalgamationInfo container class
1026
1027 // Extract information for unamalgamation
1028 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1029 GO indexBase;
1030 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1031 // GO globalOffset = amalgInfo->GlobalOffset();
1032
1033 // Extract aggregation info (already in Kokkos host views)
1034 auto procWinner = aggregates->GetProcWinner()->getLocalViewDevice(Tpetra::Access::ReadOnly);
1035 auto vertex2AggId = aggregates->GetVertex2AggId()->getLocalViewDevice(Tpetra::Access::ReadOnly);
1036 const size_t numAggregates = aggregates->GetNumAggregates();
1037
1038 int myPID = aggregates->GetMap()->getComm()->getRank();
1039
1040 // Create Kokkos::View (on the device) to store the aggreate dof sizes
1041 // Later used to get aggregate dof offsets
1042 // NOTE: This zeros itself on construction
1043 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1044 AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1045
1046 {
1047 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1048
1049 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1050 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
1051
// Shift the sizes into slots [1, numAggregates]; slot 0 stays zero so the
// subsequent inclusive scan yields exclusive offsets.
1052 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), aggSizes);
1053 }
1054
1055 // Find maximum dof size for aggregates
1056 // Later used to reserve enough scratch space for local QR decompositions
1057 LO maxAggSize = 0;
1058 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
1059 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1060
1061 // parallel_scan (exclusive)
1062 // The aggDofSizes View then contains the aggregate dof offsets
1063 Kokkos::parallel_scan(
1064 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
1065 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1066 update += aggDofSizes(i);
1067 if (final_pass)
1068 aggDofSizes(i) = update;
1069 });
1070
1071 // Create Kokkos::View on the device to store mapping
1072 // between (local) aggregate id and row map ids (LIDs)
1073 Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1074 {
1075 SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1076
1077 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1078 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1079
1080 Kokkos::parallel_for(
1081 "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1082 KOKKOS_LAMBDA(const LO lnode) {
1083 if (procWinner(lnode, 0) == myPID) {
1084 // No need for atomics, it's one-to-one
1085 auto aggID = vertex2AggId(lnode, 0);
1086
1087 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
1088 // FIXME: I think this may be wrong
1089 // We unconditionally add the whole block here. When we calculated
1090 // aggDofSizes, we did the isLocalElement check. Something's fishy.
1091 for (LO k = 0; k < stridedBlockSize; k++)
1092 aggToRowMapLO(offset + k) = lnode * stridedBlockSize + k;
1093 }
1094 });
1095 }
1096
1097 // STEP 2: prepare local QR decomposition
1098 // Reserve memory for tentative prolongation operator
1099 coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim, true);
1100
1101 // Pull out the nullspace vectors so that we can have random access (on the device)
1102 auto fineNS = fineNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
1103 auto coarseNS = coarseNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
1104
1105 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_device_type local_matrix_type;
1106 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1107 typedef typename local_matrix_type::index_type::non_const_type cols_type;
1108 // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1109
1110 // Device View for status (error messages...)
1111 typedef Kokkos::View<int[10], DeviceType> status_type;
1112 status_type status("status");
1113
1114 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
// NOTE(review): original line 1115 was dropped by the doxygen extraction here
// (presumably the Kokkos::Atomic alias "statusAtomic" of 'status', matching the
// BuildPuncoupled code path) -- confirm against the repository source. No kernel
// in the remainder of this routine writes 'status'.
1116
1117 // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1118 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1119
1120 // BlockCrs requires that we build the (block) graph first, so let's do that...
1121
1122 // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1123 // block non-zero per row in the matrix;
1124 rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1125 cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1126
// Initialize the skeleton graph: ia(j) = j reserves exactly one column slot per
// block row, ja starts as INVALID (slots left INVALID are compressed away below).
// The thread handling the last row also writes the closing rowptr entry.
1127 Kokkos::parallel_for(
1128 "MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1129 KOKKOS_LAMBDA(const LO j) {
1130 ia[j] = j;
1131 ja[j] = INVALID;
1132
1133 if (j == (LO)numFineBlockRows - 1)
1134 ia[numFineBlockRows] = numFineBlockRows;
1135 });
1136
1137 // Fill Graph
// One team per aggregate; each fine block row in the aggregate gets the
// aggregate id as its (single) coarse block column.
1138 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1139 Kokkos::parallel_for(
1140 "MueLu:TentativePF:BlockCrs:fillGraph", policy,
1141 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1142 auto agg = thread.league_rank();
1143 Xpetra::global_size_t offset = agg;
1144
1145 // size of the aggregate (number of DOFs in aggregate)
1146 LO aggSize = aggRows(agg + 1) - aggRows(agg);
1147
1148 for (LO j = 0; j < aggSize; j++) {
1149 // FIXME: Allow for bad maps
1150 const LO localRow = aggToRowMapLO[aggDofSizes[agg] + j];
1151 const size_t rowStart = ia[localRow];
1152 ja[rowStart] = offset;
1153 }
1154 });
1155
1156 // Compress storage (remove all INVALID, which happen when we skip zeros)
1157 // We do that in-place
1158 {
1159 // Stage 2: compress the arrays
1160 SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1161 // Fill i_temp with the correct row starts
// A single scan produces both the exclusive row offsets (written on the final
// pass) and, via the reduction result, the total nonzero count 'nnz'.
1162 rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1163 LO nnz = 0;
1164 Kokkos::parallel_scan(
1165 "MueLu:TentativePF:BlockCrs:compress_rows", range_type(0, numFineBlockRows),
1166 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1167 if (final)
1168 i_temp[i] = upd;
1169 for (auto j = ia[i]; j < ia[i + 1]; j++)
1170 if (ja[j] != INVALID)
1171 upd++;
1172 if (final && i == (LO)numFineBlockRows - 1)
1173 i_temp[numFineBlockRows] = upd;
1174 },
1175 nnz);
1176
1177 cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1178
1179 Kokkos::parallel_for(
1180 "MueLu:TentativePF:BlockCrs:compress_cols", range_type(0, numFineBlockRows),
1181 KOKKOS_LAMBDA(const LO i) {
1182 size_t rowStart = i_temp[i];
1183 size_t lnnz = 0;
1184 for (auto j = ia[i]; j < ia[i + 1]; j++)
1185 if (ja[j] != INVALID) {
1186 j_temp[rowStart + lnnz] = ja[j];
1187 lnnz++;
1188 }
1189 });
1190
1191 ia = i_temp;
1192 ja = j_temp;
1193 }
1194
1195 RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, ia, ja);
1196
1197 // Managing labels & constants for ESFC
1198 {
1199 RCP<ParameterList> FCparams;
1200 if (pL.isSublist("matrixmatrix: kernel params"))
1201 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1202 else
1203 FCparams = rcp(new ParameterList);
1204 // By default, we don't need global constants for TentativeP
1205 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
1206 std::string levelIDs = toString(levelID);
1207 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs);
1208 RCP<const Export> dummy_e;
1209 RCP<const Import> dummy_i;
1210 BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams);
1211 }
1212
1213 // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1214 // we clear them here
1215 ia = rows_type();
1216 ja = cols_type();
1217
1218 // Now let's make a BlockCrs Matrix
1219 // NOTE: Assumes block size== NSDim
1220 RCP<Xpetra::CrsMatrix<SC, LO, GO, NO>> P_xpetra = Xpetra::CrsMatrixFactory<SC, LO, GO, NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim);
1221 RCP<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>> P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>>(P_xpetra);
1222 if (P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1223 RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1224
1225 auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1226 const LO stride = NSDim * NSDim;
1227
// Fill values: each fine block row's NSDim x NSDim block is copied verbatim from
// the fine nullspace (no QR normalization), and the coarse nullspace receives
// ones on the block diagonal of its aggregate's block.
1228 Kokkos::parallel_for(
1229 "MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1230 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1231 auto agg = thread.league_rank();
1232
1233 // size of the aggregate (number of DOFs in aggregate)
1234 LO aggSize = aggRows(agg + 1) - aggRows(agg);
1235 Xpetra::global_size_t offset = agg * NSDim;
1236
1237 // Q = localQR(:,0)/norm
1238 for (LO j = 0; j < aggSize; j++) {
1239 LO localBlockRow = aggToRowMapLO(aggRows(agg) + j);
1240 LO rowStart = localBlockRow * stride;
1241 for (LO r = 0; r < (LO)NSDim; r++) {
1242 LO localPointRow = localBlockRow * NSDim + r;
1243 for (LO c = 0; c < (LO)NSDim; c++) {
1244 values[rowStart + r * NSDim + c] = fineNSRandom(localPointRow, c);
1245 }
1246 }
1247 }
1248
1249 // R = norm
1250 for (LO j = 0; j < (LO)NSDim; j++)
1251 coarseNS(offset + j, j) = one;
1252 });
1253
1254 Ptentative = P_wrap;
1255}
1256
1257 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
// NOTE(review): the doxygen extraction dropped original line 1258 here (the
// "void TentativePFactory_kokkos<...>::" qualifier line) -- confirm against the
// repository source.
//
// Stub for the coupled tentative prolongator (aggregates crossing process
// boundaries). Not implemented in the Kokkos factory: unconditionally throws
// Exceptions::RuntimeError. All parameters are intentionally unused.
1259 BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates> /* aggregates */,
1260 RCP<AmalgamationInfo> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1261 RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1262 RCP<MultiVector>& /* coarseNullspace */) const {
1263 throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1264}
1265
1266 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
// NOTE(review): the doxygen extraction dropped original line 1267 here (the
// "bool TentativePFactory_kokkos<...>::" qualifier line) -- confirm against the
// repository source.
//
// Returns true when the column map is "good" with respect to the row map, i.e.
// the first numRows local IDs of the column map carry exactly the same global
// IDs as the row map (row map is a nested prefix of the column map). A good map
// lets callers skip global-to-local index conversion.
1268 isGoodMap(const Map& rowMap, const Map& colMap) const {
// Local (device-capable) map views, so the comparison can run in a kernel.
1269 auto rowLocalMap = rowMap.getLocalMap();
1270 auto colLocalMap = colMap.getLocalMap();
1271
1272 const size_t numRows = rowLocalMap.getLocalNumElements();
1273 const size_t numCols = colLocalMap.getLocalNumElements();
1274
// A column map shorter than the row map cannot contain it as a prefix.
1275 if (numCols < numRows)
1276 return false;
1277
// Count positions where row-map and column-map GIDs disagree; zero mismatches
// means the prefix property holds.
1278 size_t numDiff = 0;
1279 Kokkos::parallel_reduce(
1280 "MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1281 KOKKOS_LAMBDA(const LO i, size_t& diff) {
1282 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1283 },
1284 numDiff);
1285
1286 return (numDiff == 0);
1287}
1288
1289} // namespace MueLu
1290
1291#define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1292#endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
#define SET_VALID_ENTRY(name)
maxAggDofSizeType maxAggDofSize
agg2RowMapLOType agg2RowMapLO
MueLu::DefaultLocalOrdinal LocalOrdinal
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
typename LWGraph_kokkos::local_graph_type local_graph_type
static void AmalgamateMap(const Map &sourceMap, const Matrix &A, RCP< const Map > &amalgamatedMap, Array< LO > &translation)
Method to create merged map for systems of PDEs.
Exception throws to report errors in the internal logical of the program.
Timer to be used in factories. Similar to Monitor but with additional timers.
Class that holds all level-specific information.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need's value has been saved.
int GetLevelID() const
Return level number.
void AddKeepFlag(const std::string &ename, const FactoryBase *factory=NoFactory::get(), KeepType keep=MueLu::Keep)
void Set(const std::string &ename, const T &entry, const FactoryBase *factory=NoFactory::get())
static const NoFactory * get()
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
bool isGoodMap(const Map &rowMap, const Map &colMap) const
void BuildP(Level &fineLevel, Level &coarseLevel) const
Abstract Build method.
void BuildPuncoupledBlockCrs(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
void BuildPuncoupled(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
void Build(Level &fineLevel, Level &coarseLevel) const
Build an object with this factory.
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
void DeclareInput(Level &fineLevel, Level &coarseLevel) const
Input.
void BuildPcoupled(RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace) const
static bool MapsAreNested(const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &rowMap, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &colMap)
Namespace for MueLu classes and methods.
@ Final
Keep data only for this run. Used to keep data useful for Hierarchy::Iterate(). Data will be deleted ...
@ Warnings0
Important warning messages (one line)
@ Statistics2
Print even more statistics.
@ Runtime1
Description of what is happening (more verbose)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.