MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// MueLu: A package for multigrid based preconditioning
4//
5// Copyright 2012 NTESS and the MueLu contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
11#define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
12
13#include "Kokkos_UnorderedMap.hpp"
14#include "Xpetra_CrsGraphFactory.hpp"
15
17
18#include "MueLu_Aggregates.hpp"
19#include "MueLu_AmalgamationInfo.hpp"
20#include "MueLu_AmalgamationFactory.hpp"
21
22#include "MueLu_MasterList.hpp"
23#include "MueLu_PerfUtils.hpp"
24#include "MueLu_Monitor.hpp"
25
26#include "Xpetra_IO.hpp"
27
28namespace MueLu {
29
30namespace { // anonymous
31
32template <class LocalOrdinal, class View>
33class ReduceMaxFunctor {
34 public:
35 ReduceMaxFunctor(View view)
36 : view_(view) {}
37
38 KOKKOS_INLINE_FUNCTION
39 void operator()(const LocalOrdinal& i, LocalOrdinal& vmax) const {
40 if (vmax < view_(i))
41 vmax = view_(i);
42 }
43
44 KOKKOS_INLINE_FUNCTION
45 void join(LocalOrdinal& dst, const LocalOrdinal& src) const {
46 if (dst < src) {
47 dst = src;
48 }
49 }
50
51 KOKKOS_INLINE_FUNCTION
52 void init(LocalOrdinal& dst) const {
53 dst = 0;
54 }
55
56 private:
57 View view_;
58};
59
60// local QR decomposition
61template <class LOType, class GOType, class SCType, class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
62class LocalQRDecompFunctor {
63 private:
64 typedef LOType LO;
65 typedef GOType GO;
66 typedef SCType SC;
67
68 typedef typename DeviceType::execution_space execution_space;
69 typedef typename KokkosKernels::ArithTraits<SC>::val_type impl_SC;
70 typedef KokkosKernels::ArithTraits<impl_SC> impl_ATS;
71 typedef typename impl_ATS::magnitudeType Magnitude;
72
73 public:
74 typedef Kokkos::View<impl_SC**, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
75 typedef Kokkos::View<impl_SC*, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
76
77 private:
78 NspType fineNS;
79 NspType coarseNS;
80 aggRowsType aggRows;
81 maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
82 agg2RowMapLOType agg2RowMapLO;
83 statusType statusAtomic;
84 rowsType rows;
85 rowsAuxType rowsAux;
86 colsAuxType colsAux;
87 valsAuxType valsAux;
89
90 public:
91 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_)
92 : fineNS(fineNS_)
93 , coarseNS(coarseNS_)
94 , aggRows(aggRows_)
95 , maxAggDofSize(maxAggDofSize_)
96 , agg2RowMapLO(agg2RowMapLO_)
97 , statusAtomic(statusAtomic_)
98 , rows(rows_)
99 , rowsAux(rowsAux_)
100 , colsAux(colsAux_)
101 , valsAux(valsAux_)
102 , doQRStep(doQRStep_) {}
103
104 KOKKOS_INLINE_FUNCTION
105 void operator()(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread, size_t& nnz) const {
106 auto agg = thread.league_rank();
107
108 // size of aggregate: number of DOFs in aggregate
109 auto aggSize = aggRows(agg + 1) - aggRows(agg);
110
111 const impl_SC one = impl_ATS::one();
112 const impl_SC two = one + one;
113 const impl_SC zero = impl_ATS::zero();
114 const auto zeroM = impl_ATS::magnitude(zero);
115
116 int m = aggSize;
117 int n = fineNS.extent(1);
118
119 // calculate row offset for coarse nullspace
120 Xpetra::global_size_t offset = agg * n;
121
122 if (doQRStep) {
123 // Extract the piece of the nullspace corresponding to the aggregate
124 shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
125 for (int j = 0; j < n; j++)
126 for (int k = 0; k < m; k++)
127 r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j);
128#if 0
129 printf("A\n");
130 for (int i = 0; i < m; i++) {
131 for (int j = 0; j < n; j++)
132 printf(" %5.3lf ", r(i,j));
133 printf("\n");
134 }
135#endif
136
137 // Calculate QR decomposition (standard)
138 shared_matrix q(thread.team_shmem(), m, m); // Q
139 if (m >= n) {
140 bool isSingular = false;
141
142 // Initialize Q^T
143 auto qt = q;
144 for (int i = 0; i < m; i++) {
145 for (int j = 0; j < m; j++)
146 qt(i, j) = zero;
147 qt(i, i) = one;
148 }
149
150 for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
151 // FIXME_KOKKOS: use team
152 Magnitude s = zeroM, norm, norm_x;
153 for (int i = k + 1; i < m; i++)
154 s += pow(impl_ATS::magnitude(r(i, k)), 2);
155 norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
156
157 if (norm == zero) {
158 isSingular = true;
159 break;
160 }
161
162 r(k, k) -= norm * one;
163
164 norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
165 if (norm_x == zeroM) {
166 // We have a single diagonal element in the column.
167 // No reflections required. Just need to restor r(k,k).
168 r(k, k) = norm * one;
169 continue;
170 }
171
172 // FIXME_KOKKOS: use team
173 for (int i = k; i < m; i++)
174 r(i, k) /= norm_x;
175
176 // Update R(k:m,k+1:n)
177 for (int j = k + 1; j < n; j++) {
178 // FIXME_KOKKOS: use team in the loops
179 impl_SC si = zero;
180 for (int i = k; i < m; i++)
181 si += r(i, k) * r(i, j);
182 for (int i = k; i < m; i++)
183 r(i, j) -= two * si * r(i, k);
184 }
185
186 // Update Q^T (k:m,k:m)
187 for (int j = k; j < m; j++) {
188 // FIXME_KOKKOS: use team in the loops
189 impl_SC si = zero;
190 for (int i = k; i < m; i++)
191 si += r(i, k) * qt(i, j);
192 for (int i = k; i < m; i++)
193 qt(i, j) -= two * si * r(i, k);
194 }
195
196 // Fix R(k:m,k)
197 r(k, k) = norm * one;
198 for (int i = k + 1; i < m; i++)
199 r(i, k) = zero;
200 }
201
202#if 0
203 // Q = (Q^T)^T
204 for (int i = 0; i < m; i++)
205 for (int j = 0; j < i; j++) {
206 impl_SC tmp = qt(i,j);
207 qt(i,j) = qt(j,i);
208 qt(j,i) = tmp;
209 }
210#endif
211
212 // Build coarse nullspace using the upper triangular part of R
213 for (int j = 0; j < n; j++)
214 for (int k = 0; k <= j; k++)
215 coarseNS(offset + k, j) = r(k, j);
216
217 if (isSingular) {
218 statusAtomic(1) = true;
219 return;
220 }
221
222 } else {
223 // Special handling for m < n (i.e. single node aggregates in structural mechanics)
224
225 // The local QR decomposition is not possible in the "overconstrained"
226 // case (i.e. number of columns in qr > number of rowsAux), which
227 // corresponds to #DOFs in Aggregate < n. For usual problems this
228 // is only possible for single node aggregates in structural mechanics.
229 // (Similar problems may arise in discontinuous Galerkin problems...)
230 // We bypass the QR decomposition and use an identity block in the
231 // tentative prolongator for the single node aggregate and transfer the
232 // corresponding fine level null space information 1-to-1 to the coarse
233 // level null space part.
234
235 // NOTE: The resulting tentative prolongation operator has
236 // (m*DofsPerNode-n) zero columns leading to a singular
237 // coarse level operator A. To deal with that one has the following
238 // options:
239 // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
240 // false) to add some identity block to the diagonal of the zero rowsAux
241 // in the coarse level operator A, such that standard level smoothers
242 // can be used again.
243 // - Use special (projection-based) level smoothers, which can deal
244 // with singular matrices (very application specific)
245 // - Adapt the code below to avoid zero columns. However, we do not
246 // support a variable number of DOFs per node in MueLu/Xpetra which
247 // makes the implementation really hard.
248 //
249 // FIXME: do we need to check for singularity here somehow? Zero
250 // columns would be easy but linear dependency would require proper QR.
251
252 // R = extended (by adding identity rowsAux) qr
253 for (int j = 0; j < n; j++)
254 for (int k = 0; k < n; k++)
255 if (k < m)
256 coarseNS(offset + k, j) = r(k, j);
257 else
258 coarseNS(offset + k, j) = (k == j ? one : zero);
259
260 // Q = I (rectangular)
261 for (int i = 0; i < m; i++)
262 for (int j = 0; j < n; j++)
263 q(i, j) = (j == i ? one : zero);
264 }
265
266 // Process each row in the local Q factor and fill helper arrays to assemble P
267 for (int j = 0; j < m; j++) {
268 LO localRow = agg2RowMapLO(aggRows(agg) + j);
269 size_t rowStart = rowsAux(localRow);
270 size_t lnnz = 0;
271 for (int k = 0; k < n; k++) {
272 // skip zeros
273 if (q(j, k) != zero) {
274 colsAux(rowStart + lnnz) = offset + k;
275 valsAux(rowStart + lnnz) = q(j, k);
276 lnnz++;
277 }
278 }
279 rows(localRow + 1) = lnnz;
280 nnz += lnnz;
281 }
282
283#if 0
284 printf("R\n");
285 for (int i = 0; i < m; i++) {
286 for (int j = 0; j < n; j++)
287 printf(" %5.3lf ", coarseNS(i,j));
288 printf("\n");
289 }
290
291 printf("Q\n");
292 for (int i = 0; i < aggSize; i++) {
293 for (int j = 0; j < aggSize; j++)
294 printf(" %5.3lf ", q(i,j));
295 printf("\n");
296 }
297#endif
298 } else {
300 // "no-QR" option //
302 // Local Q factor is just the fine nullspace support over the current aggregate.
303 // Local R factor is the identity.
304 // TODO I have not implemented any special handling for aggregates that are too
305 // TODO small to locally support the nullspace, as is done in the standard QR
306 // TODO case above.
307
308 for (int j = 0; j < m; j++) {
309 LO localRow = agg2RowMapLO(aggRows(agg) + j);
310 size_t rowStart = rowsAux(localRow);
311 size_t lnnz = 0;
312 for (int k = 0; k < n; k++) {
313 const impl_SC qr_jk = fineNS(localRow, k);
314 // skip zeros
315 if (qr_jk != zero) {
316 colsAux(rowStart + lnnz) = offset + k;
317 valsAux(rowStart + lnnz) = qr_jk;
318 lnnz++;
319 }
320 }
321 rows(localRow + 1) = lnnz;
322 nnz += lnnz;
323 }
324
325 for (int j = 0; j < n; j++)
326 coarseNS(offset + j, j) = one;
327 }
328 }
329};
330
331} // namespace
332
333template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
335 RCP<ParameterList> validParamList = rcp(new ParameterList());
336
337#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
338 SET_VALID_ENTRY("tentative: calculate qr");
339 SET_VALID_ENTRY("tentative: build coarse coordinates");
340#undef SET_VALID_ENTRY
341
342 validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
343 validParamList->set<RCP<const FactoryBase>>("Aggregates", Teuchos::null, "Generating factory of the aggregates");
344 validParamList->set<RCP<const FactoryBase>>("Nullspace", Teuchos::null, "Generating factory of the nullspace");
345 validParamList->set<RCP<const FactoryBase>>("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
346 validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
347 validParamList->set<RCP<const FactoryBase>>("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
348 validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory of the coordinates");
349 validParamList->set<RCP<const FactoryBase>>("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
350
351 // Make sure we don't recursively validate options for the matrixmatrix kernels
352 ParameterList norecurse;
353 norecurse.disableRecursiveValidation();
354 validParamList->set<ParameterList>("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
355
356 return validParamList;
357}
358
359template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
361 const ParameterList& pL = GetParameterList();
362 // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
363 std::string nspName = "Nullspace";
364 if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
365
366 Input(fineLevel, "A");
367 Input(fineLevel, "Aggregates");
368 Input(fineLevel, nspName);
369 Input(fineLevel, "UnAmalgamationInfo");
370 Input(fineLevel, "CoarseMap");
371 if (fineLevel.GetLevelID() == 0 &&
372 fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
373 pL.get<bool>("tentative: build coarse coordinates")) { // and we want coordinates on other levels
374 bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
375 Input(fineLevel, "Coordinates");
376 } else if (bTransferCoordinates_) {
377 Input(fineLevel, "Coordinates");
378 }
379}
380
381template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
383 return BuildP(fineLevel, coarseLevel);
384}
385
386template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
388 FactoryMonitor m(*this, "Build", coarseLevel);
389
390 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
391 typedef Xpetra::MultiVectorFactory<coordinate_type, LO, GO, NO> RealValuedMultiVectorFactory;
392 const ParameterList& pL = GetParameterList();
393 std::string nspName = "Nullspace";
394 if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
395
396 auto A = Get<RCP<Matrix>>(fineLevel, "A");
397 auto aggregates = Get<RCP<Aggregates>>(fineLevel, "Aggregates");
398 auto amalgInfo = Get<RCP<AmalgamationInfo>>(fineLevel, "UnAmalgamationInfo");
399 auto fineNullspace = Get<RCP<MultiVector>>(fineLevel, nspName);
400 auto coarseMap = Get<RCP<const Map>>(fineLevel, "CoarseMap");
401 RCP<RealValuedMultiVector> fineCoords;
402 if (bTransferCoordinates_) {
403 fineCoords = Get<RCP<RealValuedMultiVector>>(fineLevel, "Coordinates");
404 }
405
406 RCP<Matrix> Ptentative;
407 // No coarse DoFs so we need to bail by setting Ptentattive to null and returning
408 // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
409 if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
410 Ptentative = Teuchos::null;
411 Set(coarseLevel, "P", Ptentative);
412 return;
413 }
414 RCP<MultiVector> coarseNullspace;
415 RCP<RealValuedMultiVector> coarseCoords;
416
417 if (bTransferCoordinates_) {
418 RCP<const Map> coarseCoordMap;
419
420 LO blkSize = 1;
421 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
422 blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
423
424 if (blkSize == 1) {
425 // Scalar system
426 // No amalgamation required, we can use the coarseMap
427 coarseCoordMap = coarseMap;
428 } else {
429 // Vector system
430 AmalgamationFactory<SC, LO, GO, NO>::AmalgamateMap(rcp_dynamic_cast<const StridedMap>(coarseMap), coarseCoordMap);
431 }
432
433 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors(), false);
434
435 // Create overlapped fine coordinates to reduce global communication
436 auto uniqueMap = fineCoords->getMap();
437 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
438 if (aggregates->AggregatesCrossProcessors()) {
439 auto nonUniqueMap = aggregates->GetMap();
440 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
441
442 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors(), false);
443 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
444 }
445
446 // The good new is that his graph has already been constructed for the
447 // TentativePFactory and was cached in Aggregates. So this is a no-op.
448 auto aggGraph = aggregates->GetGraph();
449 auto numAggs = aggGraph.numRows();
450
451 auto fineCoordsView = fineCoords->getLocalViewDevice(Tpetra::Access::ReadOnly);
452 auto coarseCoordsView = coarseCoords->getLocalViewDevice(Tpetra::Access::OverwriteAll);
453
454 // Fill in coarse coordinates
455 {
456 SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
457
458 const auto dim = fineCoords->getNumVectors();
459
460 typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
461 for (size_t j = 0; j < dim; j++) {
462 Kokkos::parallel_for(
463 "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
464 KOKKOS_LAMBDA(const LO i) {
465 // A row in this graph represents all node ids in the aggregate
466 // Therefore, averaging is very easy
467
468 auto aggregate = aggGraph.rowConst(i);
469
470 coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
471 for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
472 sum += fineCoordsRandomView(aggregate(colID), j);
473
474 coarseCoordsView(i, j) = sum / aggregate.length;
475 });
476 }
477 }
478 }
479
480 if (!aggregates->AggregatesCrossProcessors()) {
481 if (Xpetra::Helpers<SC, LO, GO, NO>::isTpetraBlockCrs(A)) {
482 BuildPuncoupledBlockCrs(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
483 coarseLevel.GetLevelID());
484 } else {
485 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
486 }
487 } else
488 BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
489
490 // If available, use striding information of fine level matrix A for range
491 // map and coarseMap as domain map; otherwise use plain range map of
492 // Ptent = plain range map of A for range map and coarseMap as domain map.
493 // NOTE:
494 // The latter is not really safe, since there is no striding information
495 // for the range map. This is not really a problem, since striding
496 // information is always available on the intermedium levels and the
497 // coarsest levels.
498 if (A->IsView("stridedMaps") == true)
499 Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
500 else
501 Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
502
503 if (bTransferCoordinates_) {
504 Set(coarseLevel, "Coordinates", coarseCoords);
505 }
506
507 // FIXME: We should remove the NodeComm on levels past the threshold
508 if (fineLevel.IsAvailable("Node Comm")) {
509 RCP<const Teuchos::Comm<int>> nodeComm = Get<RCP<const Teuchos::Comm<int>>>(fineLevel, "Node Comm");
510 Set<RCP<const Teuchos::Comm<int>>>(coarseLevel, "Node Comm", nodeComm);
511 }
512
513 Set(coarseLevel, "Nullspace", coarseNullspace);
514 Set(coarseLevel, "P", Ptentative);
515
516 if (IsPrint(Statistics2)) {
517 RCP<ParameterList> params = rcp(new ParameterList());
518 params->set("printLoadBalancingInfo", true);
519 GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
520 }
521}
522
523template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
525 BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
526 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
527 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
528 RCP<MultiVector>& coarseNullspace, const int levelID) const {
529 auto rowMap = A->getRowMap();
530 auto colMap = A->getColMap();
531
532 const size_t numRows = rowMap->getLocalNumElements();
533 const size_t NSDim = fineNullspace->getNumVectors();
534
535 typedef KokkosKernels::ArithTraits<SC> ATS;
536 using impl_SC = typename ATS::val_type;
537 using impl_ATS = KokkosKernels::ArithTraits<impl_SC>;
538 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
539
540 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
541
542 typename Aggregates::local_graph_type aggGraph;
543 {
544 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
545 aggGraph = aggregates->GetGraph();
546 }
547 auto aggRows = aggGraph.row_map;
548 auto aggCols = aggGraph.entries;
549
550 // Aggregates map is based on the amalgamated column map
551 // We can skip global-to-local conversion if LIDs in row map are
552 // same as LIDs in column map
553 bool goodMap;
554 {
555 SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
556 goodMap = isGoodMap(*rowMap, *colMap);
557 }
558 // FIXME_KOKKOS: need to proofread later code for bad maps
559 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
560 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
561 "(i.e. \"matching\" row and column maps)");
562
563 // STEP 1: do unamalgamation
564 // The non-kokkos version uses member functions from the AmalgamationInfo
565 // container class to unamalgamate the data. In contrast, the kokkos
566 // version of TentativePFactory does the unamalgamation here and only uses
567 // the data of the AmalgamationInfo container class
568
569 // Extract information for unamalgamation
570 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
571 GO indexBase;
572 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
573 GO globalOffset = amalgInfo->GlobalOffset();
574
575 // Extract aggregation info (already in Kokkos host views)
576 auto procWinner = aggregates->GetProcWinner()->getLocalViewDevice(Tpetra::Access::ReadOnly);
577 auto vertex2AggId = aggregates->GetVertex2AggId()->getLocalViewDevice(Tpetra::Access::ReadOnly);
578 const size_t numAggregates = aggregates->GetNumAggregates();
579
580 int myPID = aggregates->GetMap()->getComm()->getRank();
581
582 // Create Kokkos::View (on the device) to store the aggreate dof sizes
583 // Later used to get aggregate dof offsets
584 // NOTE: This zeros itself on construction
585 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
586 AggSizeType aggDofSizes;
587
588 if (stridedBlockSize == 1) {
589 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
590
591 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
592 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
593
594 auto sizesConst = aggregates->ComputeAggregateSizes();
595 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), sizesConst);
596
597 } else {
598 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
599
600 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
601 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
602
603 auto nodeMap = aggregates->GetMap()->getLocalMap();
604 auto dofMap = colMap->getLocalMap();
605
606 Kokkos::parallel_for(
607 "MueLu:TentativePF:Build:compute_agg_sizes", range_type(0, numAggregates),
608 KOKKOS_LAMBDA(const LO agg) {
609 auto aggRowView = aggGraph.rowConst(agg);
610
611 size_t size = 0;
612 for (LO colID = 0; colID < aggRowView.length; colID++) {
613 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
614
615 for (LO k = 0; k < stridedBlockSize; k++) {
616 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
617
618 if (dofMap.getLocalElement(dofGID) != INVALID)
619 size++;
620 }
621 }
622 aggDofSizes(agg + 1) = size;
623 });
624 }
625
626 // Find maximum dof size for aggregates
627 // Later used to reserve enough scratch space for local QR decompositions
628 LO maxAggSize = 0;
629 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
630 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
631
632 // parallel_scan (exclusive)
633 // The aggDofSizes View then contains the aggregate dof offsets
634 Kokkos::parallel_scan(
635 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
636 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
637 update += aggDofSizes(i);
638 if (final_pass)
639 aggDofSizes(i) = update;
640 });
641
642 // Create Kokkos::View on the device to store mapping
643 // between (local) aggregate id and row map ids (LIDs)
644 Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
645 {
646 SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
647
648 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
649 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
650
651 Kokkos::parallel_for(
652 "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
653 KOKKOS_LAMBDA(const LO lnode) {
654 if (procWinner(lnode, 0) == myPID) {
655 // No need for atomics, it's one-to-one
656 auto aggID = vertex2AggId(lnode, 0);
657
658 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
659 // FIXME: I think this may be wrong
660 // We unconditionally add the whole block here. When we calculated
661 // aggDofSizes, we did the isLocalElement check. Something's fishy.
662 for (LO k = 0; k < stridedBlockSize; k++)
663 agg2RowMapLO(offset + k) = lnode * stridedBlockSize + k;
664 }
665 });
666 }
667
668 // STEP 2: prepare local QR decomposition
669 // Reserve memory for tentative prolongation operator
670 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim, true);
671
672 // Pull out the nullspace vectors so that we can have random access (on the device)
673 auto fineNS = fineNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
674 auto coarseNS = coarseNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
675
676 size_t nnz = 0; // actual number of nnz
677
678 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_device_type local_matrix_type;
679 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
680 typedef typename local_matrix_type::index_type::non_const_type cols_type;
681 typedef typename local_matrix_type::values_type::non_const_type vals_type;
682
683 // Device View for status (error messages...)
684 typedef Kokkos::View<int[10], DeviceType> status_type;
685 status_type status("status");
686
687 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
689
690 const ParameterList& pL = GetParameterList();
691 const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
692 if (!doQRStep) {
693 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
694 if (NSDim > 1)
695 GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
696 }
697
698 size_t nnzEstimate = numRows * NSDim;
699 rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows + 1);
700 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
701 vals_type valsAux("Ptent_aux_vals", nnzEstimate);
702 rows_type rows("Ptent_rows", numRows + 1);
703 {
704 // Stage 0: fill in views.
705 SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
706
707 // The main thing to notice is initialization of vals with INVALID. These
708 // values will later be used to compress the arrays
709 Kokkos::parallel_for(
710 "MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows + 1),
711 KOKKOS_LAMBDA(const LO row) {
712 rowsAux(row) = row * NSDim;
713 });
714 Kokkos::parallel_for(
715 "MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
716 KOKKOS_LAMBDA(const LO j) {
717 colsAux(j) = INVALID;
718 });
719 }
720
721 if (NSDim == 1) {
722 // 1D is special, as it is the easiest. We don't even need to the QR,
723 // just normalize an array. Plus, no worries abot small aggregates. In
724 // addition, we do not worry about compression. It is unlikely that
725 // nullspace will have zeros. If it does, a prolongator row would be
726 // zero and we'll get singularity anyway.
727 SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
728
729 // Set up team policy with numAggregates teams and one thread per team.
730 // Each team handles a slice of the data associated with one aggregate
731 // and performs a local QR decomposition (in this case real QR is
732 // unnecessary).
733 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
734
735 if (doQRStep) {
736 Kokkos::parallel_for(
737 "MueLu:TentativePF:BuildUncoupled:main_loop", policy,
738 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
739 auto agg = thread.league_rank();
740
741 // size of the aggregate (number of DOFs in aggregate)
742 LO aggSize = aggRows(agg + 1) - aggRows(agg);
743
744 // Extract the piece of the nullspace corresponding to the aggregate, and
745 // put it in the flat array, "localQR" (in column major format) for the
746 // QR routine. Trivial in 1D.
747 auto norm = impl_ATS::magnitude(zero);
748
749 // Calculate QR by hand
750 // FIXME: shouldn't there be stridedblock here?
751 // FIXME_KOKKOS: shouldn't there be stridedblock here?
752 for (decltype(aggSize) k = 0; k < aggSize; k++) {
753 auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0));
754 norm += dnorm * dnorm;
755 }
756 norm = sqrt(norm);
757
758 if (norm == zero) {
759 // zero column; terminate the execution
760 statusAtomic(1) = true;
761 return;
762 }
763
764 // R = norm
765 coarseNS(agg, 0) = norm;
766
767 // Q = localQR(:,0)/norm
768 for (decltype(aggSize) k = 0; k < aggSize; k++) {
769 LO localRow = agg2RowMapLO(aggRows(agg) + k);
770 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0) / norm;
771
772 rows(localRow + 1) = 1;
773 colsAux(localRow) = agg;
774 valsAux(localRow) = localVal;
775 }
776 });
777
778 typename status_type::host_mirror_type statusHost = Kokkos::create_mirror_view(status);
779 Kokkos::deep_copy(statusHost, status);
780 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
781 if (statusHost(i)) {
782 std::ostringstream oss;
783 oss << "MueLu::TentativePFactory::MakeTentative: ";
784 switch (i) {
785 case 0: oss << "!goodMap is not implemented"; break;
786 case 1: oss << "fine level NS part has a zero column"; break;
787 }
788 throw Exceptions::RuntimeError(oss.str());
789 }
790
791 } else {
792 Kokkos::parallel_for(
793 "MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
794 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
795 auto agg = thread.league_rank();
796
797 // size of the aggregate (number of DOFs in aggregate)
798 LO aggSize = aggRows(agg + 1) - aggRows(agg);
799
800 // R = norm
801 coarseNS(agg, 0) = one;
802
803 // Q = localQR(:,0)/norm
804 for (decltype(aggSize) k = 0; k < aggSize; k++) {
805 LO localRow = agg2RowMapLO(aggRows(agg) + k);
806 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0);
807
808 rows(localRow + 1) = 1;
809 colsAux(localRow) = agg;
810 valsAux(localRow) = localVal;
811 }
812 });
813 }
814
815 Kokkos::parallel_reduce(
816 "MueLu:TentativeP:CountNNZ", range_type(0, numRows + 1),
817 KOKKOS_LAMBDA(const LO i, size_t& nnz_count) {
818 nnz_count += rows(i);
819 },
820 nnz);
821
822 } else { // NSdim > 1
823 // FIXME_KOKKOS: This code branch is completely unoptimized.
824 // Work to do:
825 // - Optimize QR decomposition
826 // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
827 // packing new values in the beginning of each row
828 // We do use auxilary view in this case, so keep a second rows view for
829 // counting nonzeros in rows
830
831 {
832 SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
833 // Set up team policy with numAggregates teams and one thread per team.
834 // Each team handles a slice of the data associated with one aggregate
835 // and performs a local QR decomposition
836 Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1); // numAggregates teams a 1 thread
837 LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
838 decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
839 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
840 decltype(valsAux)>
841 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
843 if (doQRStep) {
844 using shared_matrix = typename decltype(localQRFunctor)::shared_matrix;
845 int m = maxAggSize;
846 int n = fineNSRandom.extent(1);
847 int size = shared_matrix::shmem_size(m, n) + // r
848 shared_matrix::shmem_size(m, m); // q
849
850 if (size < policy.scratch_size_max(/*level=*/(int)0))
851 policy.set_scratch_size(/*level=*/(int)0, Kokkos::PerTeam(size));
852 else if (size < policy.scratch_size_max(/*level=*/(int)1))
853 policy.set_scratch_size(/*level=*/(int)1, Kokkos::PerTeam(size));
854 else
855 throw Exceptions::RuntimeError("Neither L0 scratch memory (max size " + std::to_string(policy.scratch_size_max((int)0)) +
856 "), nor L1 scratch memory (max size " + std::to_string(policy.scratch_size_max((int)1)) +
857 ") is large enough for requested allocation of size " + std::to_string(size));
858 }
859 Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
860 }
861
862 typename status_type::host_mirror_type statusHost = Kokkos::create_mirror_view(status);
863 Kokkos::deep_copy(statusHost, status);
864 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
865 if (statusHost(i)) {
866 std::ostringstream oss;
867 oss << "MueLu::TentativePFactory::MakeTentative: ";
868 switch (i) {
869 case 0: oss << "!goodMap is not implemented"; break;
870 case 1: oss << "fine level NS part has a zero column"; break;
871 }
872 throw Exceptions::RuntimeError(oss.str());
873 }
874 }
875
876 // Compress the cols and vals by ignoring INVALID column entries that correspond
877 // to 0 in QR.
878
879 // The real cols and vals are constructed using calculated (not estimated) nnz
880 cols_type cols;
881 vals_type vals;
882
883 if (nnz != nnzEstimate) {
884 {
885 // Stage 2: compress the arrays
886 SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
887
888 Kokkos::parallel_scan(
889 "MueLu:TentativePF:Build:compress_rows", range_type(0, numRows + 1),
890 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
891 upd += rows(i);
892 if (final)
893 rows(i) = upd;
894 });
895 }
896
897 {
898 SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
899
900 cols = cols_type("Ptent_cols", nnz);
901 vals = vals_type("Ptent_vals", nnz);
902
903 // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
904 // to the beginning of rows. See CoalesceDropFactory_kokkos for
905 // example.
906 Kokkos::parallel_for(
907 "MueLu:TentativePF:Build:compress_cols_vals", range_type(0, numRows),
908 KOKKOS_LAMBDA(const LO i) {
909 LO rowStart = rows(i);
910
911 size_t lnnz = 0;
912 for (auto j = rowsAux(i); j < rowsAux(i + 1); j++)
913 if (colsAux(j) != INVALID) {
914 cols(rowStart + lnnz) = colsAux(j);
915 vals(rowStart + lnnz) = valsAux(j);
916 lnnz++;
917 }
918 });
919 }
920
921 } else {
922 rows = rowsAux;
923 cols = colsAux;
924 vals = valsAux;
925 }
926
927 GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
928
929 {
930 // Stage 3: construct Xpetra::Matrix
931 SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
932
933 local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
934
935 // Managing labels & constants for ESFC
936 RCP<ParameterList> FCparams;
937 if (pL.isSublist("matrixmatrix: kernel params"))
938 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
939 else
940 FCparams = rcp(new ParameterList);
941
942 // By default, we don't need global constants for TentativeP
943 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
944 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
945
946 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
947 Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
948 }
949}
950
// Builds the tentative prolongator P as a Tpetra::BlockCrsMatrix for a BlockCrs A,
// bypassing the local QR step entirely (see the assumption list below).
//
//   coarseLevel     - level object used only for sub-timers here
//   A               - fine-level BlockCrs operator; supplies row/range/column maps
//   aggregates      - aggregation info (graph, sizes, procWinner, vertex2AggId)
//   amalgInfo       - striding information used for unamalgamation
//   fineNullspace   - fine-level nullspace vectors; NSDim == number of vectors
//   coarsePointMap  - coarse *point* map; the coarse *block* map is derived from it
//   Ptentative      - [out] the tentative prolongator (CrsMatrixWrap around BlockCrs)
//   coarseNullspace - [out] coarse nullspace, set to identity blocks (one per aggregate)
//   levelID         - used only for the ESFC timer label
951template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
// NOTE(review): the class-qualified signature line ("void TentativePFactory_kokkos<...>::")
// that should precede the argument list appears to be missing from this listing
// (embedded numbering jumps 951 -> 953) -- verify against the original file.
953    BuildPuncoupledBlockCrs(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
954                            RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
955                            RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
956                            RCP<MultiVector>& coarseNullspace, const int levelID) const {
957  /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
958     be generalized later, if we ever need to do so:
959     1) Null space dimension === block size of matrix: So no elasticity right now
960     2) QR is not supported: Under assumption #1, this shouldn't cause problems.
961     3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
962
963     These assumptions keep our code way simpler and still support the use cases we actually care about.
964  */
965
966  RCP<const Map> rowMap = A->getRowMap();
967  RCP<const Map> rangeMap = A->getRangeMap();
968  RCP<const Map> colMap = A->getColMap();
969  // const size_t numFinePointRows = rangeMap->getLocalNumElements();
970  const size_t numFineBlockRows = rowMap->getLocalNumElements();
971
972  // typedef Teuchos::ScalarTraits<SC> STS;
973  // typedef typename STS::magnitudeType Magnitude;
974  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
975
976  typedef KokkosKernels::ArithTraits<SC> ATS;
977  using impl_SC = typename ATS::val_type;
978  using impl_ATS = KokkosKernels::ArithTraits<impl_SC>;
979  const impl_SC one = impl_ATS::one();
980
981  // const GO numAggs = aggregates->GetNumAggregates();
982  const size_t NSDim = fineNullspace->getNumVectors();
983  auto aggSizes = aggregates->ComputeAggregateSizes();
984
985  typename Aggregates::local_graph_type aggGraph;
986  {
987    SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
988    aggGraph = aggregates->GetGraph();
989  }
990  auto aggRows = aggGraph.row_map;
991  auto aggCols = aggGraph.entries;
992
993  // Need to generate the coarse block map
994  // NOTE: We assume NSDim == block size here
995  // NOTE: We also assume that coarseMap has contiguous GIDs
996  // const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
997  const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
998  RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
999                                                    Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
1000                                                    numCoarseBlockRows,
1001                                                    coarsePointMap->getIndexBase(),
1002                                                    coarsePointMap->getComm());
1003  // Sanity checking
1004  const ParameterList& pL = GetParameterList();
1005  // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1006
1007  // The aggregates use the amalgamated column map, which in this case is what we want
1008
1009  // Aggregates map is based on the amalgamated column map
1010  // We can skip global-to-local conversion if LIDs in row map are
1011  // same as LIDs in column map
1012  bool goodMap = MueLu::Utilities<SC, LO, GO, NO>::MapsAreNested(*rowMap, *colMap);
1013  TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
1014                             "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1015                             "(i.e. \"matching\" row and column maps)");
1016
1017  // STEP 1: do unamalgamation
1018  // The non-kokkos version uses member functions from the AmalgamationInfo
1019  // container class to unamalgamate the data. In contrast, the kokkos
1020  // version of TentativePFactory does the unamalgamation here and only uses
1021  // the data of the AmalgamationInfo container class
1022
1023  // Extract information for unamalgamation
1024  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1025  GO indexBase;
1026  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1027  // GO globalOffset = amalgInfo->GlobalOffset();
1028
1029  // Extract aggregation info (already in Kokkos host views)
1030  auto procWinner = aggregates->GetProcWinner()->getLocalViewDevice(Tpetra::Access::ReadOnly);
1031  auto vertex2AggId = aggregates->GetVertex2AggId()->getLocalViewDevice(Tpetra::Access::ReadOnly);
1032  const size_t numAggregates = aggregates->GetNumAggregates();
1033
1034  int myPID = aggregates->GetMap()->getComm()->getRank();
1035
1036  // Create Kokkos::View (on the device) to store the aggreate dof sizes
1037  // Later used to get aggregate dof offsets
1038  // NOTE: This zeros itself on construction
1039  typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1040  AggSizeType aggDofSizes;  // This turns into "starts" after the parallel_scan
1041
1042  {
1043    SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1044
1045    // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1046    aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
1047
1048    // Copy sizes into slots [1, numAggregates]; slot 0 stays zero so the
1049    // inclusive scan below yields exclusive per-aggregate offsets.
1049    Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), aggSizes);
1049  }
1050
1051  // Find maximum dof size for aggregates
1052  // Later used to reserve enough scratch space for local QR decompositions
1053  LO maxAggSize = 0;
1054  ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
1055  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1056
1057  // parallel_scan (exclusive)
1058  // The aggDofSizes View then contains the aggregate dof offsets
1059  Kokkos::parallel_scan(
1060      "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
1061      KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1062        update += aggDofSizes(i);
1063        if (final_pass)
1064          aggDofSizes(i) = update;
1065      });
1066
1067  // Create Kokkos::View on the device to store mapping
1068  // between (local) aggregate id and row map ids (LIDs)
1069  Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1070  {
1071    SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1072
1073    AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1074    Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1075
1076    Kokkos::parallel_for(
1077        "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1078        KOKKOS_LAMBDA(const LO lnode) {
1079          if (procWinner(lnode, 0) == myPID) {
1080            // No need for atomics, it's one-to-one
1081            auto aggID = vertex2AggId(lnode, 0);
1082
1083            auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
1084            // FIXME: I think this may be wrong
1085            // We unconditionally add the whole block here. When we calculated
1086            // aggDofSizes, we did the isLocalElement check. Something's fishy.
1087            for (LO k = 0; k < stridedBlockSize; k++)
1088              aggToRowMapLO(offset + k) = lnode * stridedBlockSize + k;
1089          }
1090        });
1091  }
1092
1093  // STEP 2: prepare local QR decomposition
1094  // Reserve memory for tentative prolongation operator
1095  coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim, true);
1096
1097  // Pull out the nullspace vectors so that we can have random access (on the device)
1098  auto fineNS = fineNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
1099  auto coarseNS = coarseNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
1100
1101  typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_device_type local_matrix_type;
1102  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1103  typedef typename local_matrix_type::index_type::non_const_type cols_type;
1104  // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1105
1106  // Device View for status (error messages...)
1107  typedef Kokkos::View<int[10], DeviceType> status_type;
1108  status_type status("status");
1109
1110  typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
1112
1113  // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1114  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1115
1116  // BlockCrs requires that we build the (block) graph first, so let's do that...
1117
1118  // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1119  // block non-zero per row in the matrix;
1120  rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1121  cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1122
1123  // With one entry per block row the rowptr is the identity map; columns start
1124  // as INVALID and are overwritten below for rows owned by an aggregate.
1123  Kokkos::parallel_for(
1124      "MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1125      KOKKOS_LAMBDA(const LO j) {
1126        ia[j] = j;
1127        ja[j] = INVALID;
1128
1129        if (j == (LO)numFineBlockRows - 1)
1130          ia[numFineBlockRows] = numFineBlockRows;
1131      });
1132
1133  // Fill Graph
1134  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1135  Kokkos::parallel_for(
1136      "MueLu:TentativePF:BlockCrs:fillGraph", policy,
1137      KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1138        auto agg = thread.league_rank();
1139        Xpetra::global_size_t offset = agg;
1140
1141        // size of the aggregate (number of DOFs in aggregate)
1142        LO aggSize = aggRows(agg + 1) - aggRows(agg);
1143
1144        for (LO j = 0; j < aggSize; j++) {
1145          // FIXME: Allow for bad maps
1146          const LO localRow = aggToRowMapLO[aggDofSizes[agg] + j];
1147          const size_t rowStart = ia[localRow];
1148          ja[rowStart] = offset;
1149        }
1150      });
1151
1152  // Compress storage (remove all INVALID, which happen when we skip zeros)
1153  // We do that in-place
1154  {
1155    // Stage 2: compress the arrays
1156    SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1157    // Fill i_temp with the correct row starts
1158    rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1159    LO nnz = 0;
1160    // Single scan pass: on the final pass it writes the compressed rowptr,
1161    // and the reduction result is the total number of kept entries (nnz).
1160    Kokkos::parallel_scan(
1161        "MueLu:TentativePF:BlockCrs:compress_rows", range_type(0, numFineBlockRows),
1162        KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1163          if (final)
1164            i_temp[i] = upd;
1165          for (auto j = ia[i]; j < ia[i + 1]; j++)
1166            if (ja[j] != INVALID)
1167              upd++;
1168          if (final && i == (LO)numFineBlockRows - 1)
1169            i_temp[numFineBlockRows] = upd;
1170        },
1171        nnz);
1172
1173    cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1174
1175    Kokkos::parallel_for(
1176        "MueLu:TentativePF:BlockCrs:compress_cols", range_type(0, numFineBlockRows),
1177        KOKKOS_LAMBDA(const LO i) {
1178          size_t rowStart = i_temp[i];
1179          size_t lnnz = 0;
1180          for (auto j = ia[i]; j < ia[i + 1]; j++)
1181            if (ja[j] != INVALID) {
1182              j_temp[rowStart + lnnz] = ja[j];
1183              lnnz++;
1184            }
1185        });
1186
1187    ia = i_temp;
1188    ja = j_temp;
1189  }
1190
1191  RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, ia, ja);
1192
1193  // Managing labels & constants for ESFC
1194  {
1195    RCP<ParameterList> FCparams;
1196    if (pL.isSublist("matrixmatrix: kernel params"))
1197      FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1198    else
1199      FCparams = rcp(new ParameterList);
1200    // By default, we don't need global constants for TentativeP
1201    FCparams->set("compute global constants", FCparams->get("compute global constants", false));
1202    std::string levelIDs = toString(levelID);
1203    FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs);
1204    RCP<const Export> dummy_e;
1205    RCP<const Import> dummy_i;
1206    // domain map = coarse block map, range map = fine row map; no importer/exporter needed
1206    BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams);
1207  }
1208
1209  // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1210  // we clear them here
1211  ia = rows_type();
1212  ja = cols_type();
1213
1214  // Now let's make a BlockCrs Matrix
1215  // NOTE: Assumes block size== NSDim
1216  RCP<Xpetra::CrsMatrix<SC, LO, GO, NO>> P_xpetra = Xpetra::CrsMatrixFactory<SC, LO, GO, NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim);
1217  RCP<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>> P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>>(P_xpetra);
1218  if (P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1219  RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1220
1221  auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1222  const LO stride = NSDim * NSDim;
1223
1224  // Fill the block values: each block row gets the corresponding NSDim x NSDim
1225  // slice of the fine nullspace; the coarse nullspace gets identity blocks.
1224  Kokkos::parallel_for(
1225      "MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1226      KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1227        auto agg = thread.league_rank();
1228
1229        // size of the aggregate (number of DOFs in aggregate)
1230        LO aggSize = aggRows(agg + 1) - aggRows(agg);
1231        Xpetra::global_size_t offset = agg * NSDim;
1232
1233        // Q = localQR(:,0)/norm
1234        for (LO j = 0; j < aggSize; j++) {
1235          LO localBlockRow = aggToRowMapLO(aggRows(agg) + j);
1236          LO rowStart = localBlockRow * stride;
1237          for (LO r = 0; r < (LO)NSDim; r++) {
1238            LO localPointRow = localBlockRow * NSDim + r;
1239            for (LO c = 0; c < (LO)NSDim; c++) {
1240              values[rowStart + r * NSDim + c] = fineNSRandom(localPointRow, c);
1241            }
1242          }
1243        }
1244
1245        // R = norm
1246        for (LO j = 0; j < (LO)NSDim; j++)
1247          coarseNS(offset + j, j) = one;
1248      });
1249
1250  Ptentative = P_wrap;
1251}
1252
// Coupled tentative prolongator construction (aggregates that cross processor
// boundaries) is intentionally unsupported in the Kokkos factory: every
// argument is ignored and a RuntimeError is thrown unconditionally.
1253template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
// NOTE(review): the class-qualified signature line ("void TentativePFactory_kokkos<...>::")
// appears to be missing from this listing (numbering jumps 1253 -> 1255) --
// verify against the original file.
1255    BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates> /* aggregates */,
1256                  RCP<AmalgamationInfo> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1257                  RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1258                  RCP<MultiVector>& /* coarseNullspace */) const {
1259  throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1260}
1261
// Returns true iff the column map is a superset of the row map with matching
// local ordering, i.e. for every local row LID i, the GID of LID i in the row
// map equals the GID of LID i in the column map. Factories above use this to
// skip global-to-local index translation.
1262template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
// NOTE(review): the class-qualified signature line ("bool TentativePFactory_kokkos<...>::")
// appears to be missing from this listing (numbering jumps 1262 -> 1264) --
// verify against the original file.
1264    isGoodMap(const Map& rowMap, const Map& colMap) const {
1265  auto rowLocalMap = rowMap.getLocalMap();
1266  auto colLocalMap = colMap.getLocalMap();
1267
1268  const size_t numRows = rowLocalMap.getLocalNumElements();
1269  const size_t numCols = colLocalMap.getLocalNumElements();
1270
1271  // A nested map cannot have fewer columns than rows; cheap early exit.
1271  if (numCols < numRows)
1272    return false;
1273
1274  size_t numDiff = 0;
1275  // Count positions where row-map and col-map GIDs disagree (device reduction);
1276  // the bool comparison contributes 1 per mismatch.
1275  Kokkos::parallel_reduce(
1276      "MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1277      KOKKOS_LAMBDA(const LO i, size_t& diff) {
1278        diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1279      },
1280      numDiff);
1281
1282  return (numDiff == 0);
1283}
1284
1285} // namespace MueLu
1286
1287#define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1288#endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
#define SET_VALID_ENTRY(name)
maxAggDofSizeType maxAggDofSize
agg2RowMapLOType agg2RowMapLO
MueLu::DefaultLocalOrdinal LocalOrdinal
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
typename LWGraph_kokkos::local_graph_type local_graph_type
static void AmalgamateMap(const Map &sourceMap, const Matrix &A, RCP< const Map > &amalgamatedMap, Array< LO > &translation)
Method to create merged map for systems of PDEs.
Exception thrown to report errors in the internal logic of the program.
Timer to be used in factories. Similar to Monitor but with additional timers.
Class that holds all level-specific information.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need's value has been saved.
int GetLevelID() const
Return level number.
static const NoFactory * get()
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
bool isGoodMap(const Map &rowMap, const Map &colMap) const
void BuildP(Level &fineLevel, Level &coarseLevel) const
Abstract Build method.
void BuildPuncoupledBlockCrs(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
void BuildPuncoupled(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
void Build(Level &fineLevel, Level &coarseLevel) const
Build an object with this factory.
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
void DeclareInput(Level &fineLevel, Level &coarseLevel) const
Input.
void BuildPcoupled(RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace) const
static bool MapsAreNested(const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &rowMap, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &colMap)
Namespace for MueLu classes and methods.
@ Warnings0
Important warning messages (one line)
@ Statistics2
Print even more statistics.
@ Runtime1
Description of what is happening (more verbose)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.