MueLu_TentativePFactory_kokkos_def.hpp
1// @HEADER
2// *****************************************************************************
3// MueLu: A package for multigrid based preconditioning
4//
5// Copyright 2012 NTESS and the MueLu contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
11#define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
12
13#include "Kokkos_UnorderedMap.hpp"
14#include "Xpetra_CrsGraphFactory.hpp"
15
16#include "MueLu_TentativePFactory_kokkos_decl.hpp"
17
18#include "MueLu_Aggregates.hpp"
19#include "MueLu_AmalgamationInfo.hpp"
20#include "MueLu_AmalgamationFactory.hpp"
21
22#include "MueLu_MasterList.hpp"
23#include "MueLu_PerfUtils.hpp"
24#include "MueLu_Monitor.hpp"
25
26#include "Xpetra_IO.hpp"
27
28namespace MueLu {
29
30namespace { // anonymous
31
32template <class LocalOrdinal, class View>
33class ReduceMaxFunctor {
34 public:
35 ReduceMaxFunctor(View view)
36 : view_(view) {}
37
38 KOKKOS_INLINE_FUNCTION
39 void operator()(const LocalOrdinal& i, LocalOrdinal& vmax) const {
40 if (vmax < view_(i))
41 vmax = view_(i);
42 }
43
44 KOKKOS_INLINE_FUNCTION
45 void join(LocalOrdinal& dst, const LocalOrdinal& src) const {
46 if (dst < src) {
47 dst = src;
48 }
49 }
50
51 KOKKOS_INLINE_FUNCTION
52 void init(LocalOrdinal& dst) const {
53 dst = 0;
54 }
55
56 private:
57  View view_;
58};
59
60// local QR decomposition
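// A brief sketch of the per-aggregate Householder QR that the functor below implements:
// for each column k of the m x n block A (stored in r), it forms the reflector
//   v = (r(k:m-1,k) - ||r(k:m-1,k)|| * e_1), normalized and kept in-place in column k,
// and applies H = I - 2*v*v^T to the trailing columns of r and to Q^T:
//   r(k:m-1,j)  -= 2 * (v^T r(k:m-1,j))  * v,   j = k+1..n-1
//   qt(k:m-1,j) -= 2 * (v^T qt(k:m-1,j)) * v,   j = k..m-1
// After the last step r holds the upper-triangular factor R and qt holds Q^T.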
61template <class LOType, class GOType, class SCType, class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
62class LocalQRDecompFunctor {
63 private:
64 typedef LOType LO;
65 typedef GOType GO;
66 typedef SCType SC;
67
68 typedef typename DeviceType::execution_space execution_space;
69#if KOKKOS_VERSION >= 40799
70 typedef typename KokkosKernels::ArithTraits<SC>::val_type impl_SC;
71#else
72 typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
73#endif
74#if KOKKOS_VERSION >= 40799
75 typedef KokkosKernels::ArithTraits<impl_SC> impl_ATS;
76#else
77 typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
78#endif
79 typedef typename impl_ATS::magnitudeType Magnitude;
80
81 typedef Kokkos::View<impl_SC**, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
82 typedef Kokkos::View<impl_SC*, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
83
84 private:
85 NspType fineNS;
86 NspType coarseNS;
87 aggRowsType aggRows;
 88  maxAggDofSizeType maxAggDofSize;  ///< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
89 agg2RowMapLOType agg2RowMapLO;
90 statusType statusAtomic;
91 rowsType rows;
92 rowsAuxType rowsAux;
93 colsAuxType colsAux;
94 valsAuxType valsAux;
95  bool doQRStep;
96
97 public:
98 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_)
99 : fineNS(fineNS_)
100 , coarseNS(coarseNS_)
101 , aggRows(aggRows_)
102 , maxAggDofSize(maxAggDofSize_)
103 , agg2RowMapLO(agg2RowMapLO_)
104 , statusAtomic(statusAtomic_)
105 , rows(rows_)
106 , rowsAux(rowsAux_)
107 , colsAux(colsAux_)
108 , valsAux(valsAux_)
109 , doQRStep(doQRStep_) {}
110
111 KOKKOS_INLINE_FUNCTION
112 void operator()(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread, size_t& nnz) const {
113 auto agg = thread.league_rank();
114
115 // size of aggregate: number of DOFs in aggregate
116 auto aggSize = aggRows(agg + 1) - aggRows(agg);
117
118 const impl_SC one = impl_ATS::one();
119 const impl_SC two = one + one;
120 const impl_SC zero = impl_ATS::zero();
121 const auto zeroM = impl_ATS::magnitude(zero);
122
123 int m = aggSize;
124 int n = fineNS.extent(1);
125
126 // calculate row offset for coarse nullspace
127 Xpetra::global_size_t offset = agg * n;
128
129 if (doQRStep) {
130 // Extract the piece of the nullspace corresponding to the aggregate
131 shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
132 for (int j = 0; j < n; j++)
133 for (int k = 0; k < m; k++)
134 r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j);
135#if 0
136 printf("A\n");
137 for (int i = 0; i < m; i++) {
138 for (int j = 0; j < n; j++)
139 printf(" %5.3lf ", r(i,j));
140 printf("\n");
141 }
142#endif
143
144 // Calculate QR decomposition (standard)
145 shared_matrix q(thread.team_shmem(), m, m); // Q
146 if (m >= n) {
147 bool isSingular = false;
148
149 // Initialize Q^T
150 auto qt = q;
151 for (int i = 0; i < m; i++) {
152 for (int j = 0; j < m; j++)
153 qt(i, j) = zero;
154 qt(i, i) = one;
155 }
156
157        for (int k = 0; k < n; k++) {  // we run up to "n" rather than "n-1" so that the last column gets normalized too
158 // FIXME_KOKKOS: use team
159 Magnitude s = zeroM, norm, norm_x;
160 for (int i = k + 1; i < m; i++)
161 s += pow(impl_ATS::magnitude(r(i, k)), 2);
162 norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
163
164 if (norm == zero) {
165 isSingular = true;
166 break;
167 }
168
169 r(k, k) -= norm * one;
170
171 norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
172 if (norm_x == zeroM) {
173 // We have a single diagonal element in the column.
174            // No reflections required. Just need to restore r(k,k).
175 r(k, k) = norm * one;
176 continue;
177 }
178
179 // FIXME_KOKKOS: use team
180 for (int i = k; i < m; i++)
181 r(i, k) /= norm_x;
182
183 // Update R(k:m,k+1:n)
184 for (int j = k + 1; j < n; j++) {
185 // FIXME_KOKKOS: use team in the loops
186 impl_SC si = zero;
187 for (int i = k; i < m; i++)
188 si += r(i, k) * r(i, j);
189 for (int i = k; i < m; i++)
190 r(i, j) -= two * si * r(i, k);
191 }
192
193 // Update Q^T (k:m,k:m)
194 for (int j = k; j < m; j++) {
195 // FIXME_KOKKOS: use team in the loops
196 impl_SC si = zero;
197 for (int i = k; i < m; i++)
198 si += r(i, k) * qt(i, j);
199 for (int i = k; i < m; i++)
200 qt(i, j) -= two * si * r(i, k);
201 }
202
203 // Fix R(k:m,k)
204 r(k, k) = norm * one;
205 for (int i = k + 1; i < m; i++)
206 r(i, k) = zero;
207 }
208
209#if 0
210 // Q = (Q^T)^T
211 for (int i = 0; i < m; i++)
212 for (int j = 0; j < i; j++) {
213 impl_SC tmp = qt(i,j);
214 qt(i,j) = qt(j,i);
215 qt(j,i) = tmp;
216 }
217#endif
218
219 // Build coarse nullspace using the upper triangular part of R
220 for (int j = 0; j < n; j++)
221 for (int k = 0; k <= j; k++)
222 coarseNS(offset + k, j) = r(k, j);
223
224 if (isSingular) {
225 statusAtomic(1) = true;
226 return;
227 }
228
229 } else {
230 // Special handling for m < n (i.e. single node aggregates in structural mechanics)
231
232 // The local QR decomposition is not possible in the "overconstrained"
233        // case (i.e. number of columns in qr > number of rows), which
234 // corresponds to #DOFs in Aggregate < n. For usual problems this
235 // is only possible for single node aggregates in structural mechanics.
236 // (Similar problems may arise in discontinuous Galerkin problems...)
237 // We bypass the QR decomposition and use an identity block in the
238 // tentative prolongator for the single node aggregate and transfer the
239 // corresponding fine level null space information 1-to-1 to the coarse
240 // level null space part.
241
242 // NOTE: The resulting tentative prolongation operator has
243 // (m*DofsPerNode-n) zero columns leading to a singular
244 // coarse level operator A. To deal with that one has the following
245 // options:
246 // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
247        //   false) to add some identity block to the diagonal of the zero rows
248 // in the coarse level operator A, such that standard level smoothers
249 // can be used again.
250 // - Use special (projection-based) level smoothers, which can deal
251 // with singular matrices (very application specific)
252 // - Adapt the code below to avoid zero columns. However, we do not
253 // support a variable number of DOFs per node in MueLu/Xpetra which
254 // makes the implementation really hard.
255 //
256 // FIXME: do we need to check for singularity here somehow? Zero
257 // columns would be easy but linear dependency would require proper QR.
258
259        // R = extended (by adding identity rows) qr
260 for (int j = 0; j < n; j++)
261 for (int k = 0; k < n; k++)
262 if (k < m)
263 coarseNS(offset + k, j) = r(k, j);
264 else
265 coarseNS(offset + k, j) = (k == j ? one : zero);
266
267 // Q = I (rectangular)
268 for (int i = 0; i < m; i++)
269 for (int j = 0; j < n; j++)
270 q(i, j) = (j == i ? one : zero);
271 }
272
273 // Process each row in the local Q factor and fill helper arrays to assemble P
274 for (int j = 0; j < m; j++) {
275 LO localRow = agg2RowMapLO(aggRows(agg) + j);
276 size_t rowStart = rowsAux(localRow);
277 size_t lnnz = 0;
278 for (int k = 0; k < n; k++) {
279 // skip zeros
280 if (q(j, k) != zero) {
281 colsAux(rowStart + lnnz) = offset + k;
282 valsAux(rowStart + lnnz) = q(j, k);
283 lnnz++;
284 }
285 }
286 rows(localRow + 1) = lnnz;
287 nnz += lnnz;
288 }
289
290#if 0
291 printf("R\n");
292 for (int i = 0; i < m; i++) {
293 for (int j = 0; j < n; j++)
294 printf(" %5.3lf ", coarseNS(i,j));
295 printf("\n");
296 }
297
298 printf("Q\n");
299 for (int i = 0; i < aggSize; i++) {
300 for (int j = 0; j < aggSize; j++)
301 printf(" %5.3lf ", q(i,j));
302 printf("\n");
303 }
304#endif
305 } else {
307 // "no-QR" option //
309 // Local Q factor is just the fine nullspace support over the current aggregate.
310 // Local R factor is the identity.
311 // TODO I have not implemented any special handling for aggregates that are too
312 // TODO small to locally support the nullspace, as is done in the standard QR
313 // TODO case above.
314
315 for (int j = 0; j < m; j++) {
316 LO localRow = agg2RowMapLO(aggRows(agg) + j);
317 size_t rowStart = rowsAux(localRow);
318 size_t lnnz = 0;
319 for (int k = 0; k < n; k++) {
320 const impl_SC qr_jk = fineNS(localRow, k);
321 // skip zeros
322 if (qr_jk != zero) {
323 colsAux(rowStart + lnnz) = offset + k;
324 valsAux(rowStart + lnnz) = qr_jk;
325 lnnz++;
326 }
327 }
328 rows(localRow + 1) = lnnz;
329 nnz += lnnz;
330 }
331
332 for (int j = 0; j < n; j++)
333 coarseNS(offset + j, j) = one;
334 }
335 }
336
337 // amount of shared memory
338 size_t team_shmem_size(int /* team_size */) const {
339 if (doQRStep) {
340 int m = maxAggDofSize;
341 int n = fineNS.extent(1);
342 return shared_matrix::shmem_size(m, n) + // r
343 shared_matrix::shmem_size(m, m); // q
344 } else
345 return 0;
346 }
347};
348
349} // namespace
350
351template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
352RCP<const ParameterList> TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::GetValidParameterList() const {
353  RCP<ParameterList> validParamList = rcp(new ParameterList());
354
355#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
356 SET_VALID_ENTRY("tentative: calculate qr");
357 SET_VALID_ENTRY("tentative: build coarse coordinates");
358#undef SET_VALID_ENTRY
359
360 validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
361 validParamList->set<RCP<const FactoryBase>>("Aggregates", Teuchos::null, "Generating factory of the aggregates");
362 validParamList->set<RCP<const FactoryBase>>("Nullspace", Teuchos::null, "Generating factory of the nullspace");
363 validParamList->set<RCP<const FactoryBase>>("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
364 validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
365 validParamList->set<RCP<const FactoryBase>>("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
366 validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory of the coordinates");
367 validParamList->set<RCP<const FactoryBase>>("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
368
369 // Make sure we don't recursively validate options for the matrixmatrix kernels
370 ParameterList norecurse;
371 norecurse.disableRecursiveValidation();
372 validParamList->set<ParameterList>("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
373
374 return validParamList;
375}
376
377template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
378void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::DeclareInput(Level& fineLevel, Level& coarseLevel) const {
379  const ParameterList& pL = GetParameterList();
380 // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
381 std::string nspName = "Nullspace";
382 if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
383
384 Input(fineLevel, "A");
385 Input(fineLevel, "Aggregates");
386 Input(fineLevel, nspName);
387 Input(fineLevel, "UnAmalgamationInfo");
388 Input(fineLevel, "CoarseMap");
389 if (fineLevel.GetLevelID() == 0 &&
390 fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
391 pL.get<bool>("tentative: build coarse coordinates")) { // and we want coordinates on other levels
392 bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
393 Input(fineLevel, "Coordinates");
394 } else if (bTransferCoordinates_) {
395 Input(fineLevel, "Coordinates");
396 }
397}
398
399template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
400void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& fineLevel, Level& coarseLevel) const {
401  return BuildP(fineLevel, coarseLevel);
402}
403
404template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
405void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::BuildP(Level& fineLevel, Level& coarseLevel) const {
406  FactoryMonitor m(*this, "Build", coarseLevel);
407
408 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
409 typedef Xpetra::MultiVectorFactory<coordinate_type, LO, GO, NO> RealValuedMultiVectorFactory;
410 const ParameterList& pL = GetParameterList();
411 std::string nspName = "Nullspace";
412 if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
413
414 auto A = Get<RCP<Matrix>>(fineLevel, "A");
415 auto aggregates = Get<RCP<Aggregates>>(fineLevel, "Aggregates");
416 auto amalgInfo = Get<RCP<AmalgamationInfo>>(fineLevel, "UnAmalgamationInfo");
417 auto fineNullspace = Get<RCP<MultiVector>>(fineLevel, nspName);
418 auto coarseMap = Get<RCP<const Map>>(fineLevel, "CoarseMap");
419 RCP<RealValuedMultiVector> fineCoords;
420 if (bTransferCoordinates_) {
421 fineCoords = Get<RCP<RealValuedMultiVector>>(fineLevel, "Coordinates");
422 }
423
424 RCP<Matrix> Ptentative;
425  // No coarse DoFs, so we need to bail by setting Ptentative to null and returning
426  // This level will ultimately be removed in MueLu_Hierarchy_def.hpp via a resize()
427 if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
428 Ptentative = Teuchos::null;
429 Set(coarseLevel, "P", Ptentative);
430 return;
431 }
432 RCP<MultiVector> coarseNullspace;
433 RCP<RealValuedMultiVector> coarseCoords;
434
435 if (bTransferCoordinates_) {
436 RCP<const Map> coarseCoordMap;
437
438 LO blkSize = 1;
439 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
440 blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
441
442 if (blkSize == 1) {
443 // Scalar system
444 // No amalgamation required, we can use the coarseMap
445 coarseCoordMap = coarseMap;
446 } else {
447 // Vector system
448 AmalgamationFactory<SC, LO, GO, NO>::AmalgamateMap(rcp_dynamic_cast<const StridedMap>(coarseMap), coarseCoordMap);
449 }
450
451 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
452
453 // Create overlapped fine coordinates to reduce global communication
454 auto uniqueMap = fineCoords->getMap();
455 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
456 if (aggregates->AggregatesCrossProcessors()) {
457 auto nonUniqueMap = aggregates->GetMap();
458 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
459
460 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
461 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
462 }
463
464    // The good news is that this graph has already been constructed for the
465 // TentativePFactory and was cached in Aggregates. So this is a no-op.
466 auto aggGraph = aggregates->GetGraph();
467 auto numAggs = aggGraph.numRows();
468
469 auto fineCoordsView = fineCoords->getLocalViewDevice(Tpetra::Access::ReadOnly);
470 auto coarseCoordsView = coarseCoords->getLocalViewDevice(Tpetra::Access::OverwriteAll);
471
472 // Fill in coarse coordinates
473 {
474 SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
475
476 const auto dim = fineCoords->getNumVectors();
477
478 typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
479 for (size_t j = 0; j < dim; j++) {
480 Kokkos::parallel_for(
481 "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
482 KOKKOS_LAMBDA(const LO i) {
483 // A row in this graph represents all node ids in the aggregate
484 // Therefore, averaging is very easy
485
486 auto aggregate = aggGraph.rowConst(i);
487
488 coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
489 for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
490 sum += fineCoordsRandomView(aggregate(colID), j);
491
492 coarseCoordsView(i, j) = sum / aggregate.length;
493 });
494 }
495 }
496 }
497
498 if (!aggregates->AggregatesCrossProcessors()) {
499 if (Xpetra::Helpers<SC, LO, GO, NO>::isTpetraBlockCrs(A)) {
500 BuildPuncoupledBlockCrs(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
501 coarseLevel.GetLevelID());
502 } else {
503 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
504 }
505 } else
506 BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
507
508 // If available, use striding information of fine level matrix A for range
509 // map and coarseMap as domain map; otherwise use plain range map of
510 // Ptent = plain range map of A for range map and coarseMap as domain map.
511 // NOTE:
512 // The latter is not really safe, since there is no striding information
513 // for the range map. This is not really a problem, since striding
514  // information is always available on the intermediate levels and the
515 // coarsest levels.
516 if (A->IsView("stridedMaps") == true)
517 Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
518 else
519 Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
520
521 if (bTransferCoordinates_) {
522 Set(coarseLevel, "Coordinates", coarseCoords);
523 }
524
525 // FIXME: We should remove the NodeComm on levels past the threshold
526 if (fineLevel.IsAvailable("Node Comm")) {
527 RCP<const Teuchos::Comm<int>> nodeComm = Get<RCP<const Teuchos::Comm<int>>>(fineLevel, "Node Comm");
528 Set<RCP<const Teuchos::Comm<int>>>(coarseLevel, "Node Comm", nodeComm);
529 }
530
531 Set(coarseLevel, "Nullspace", coarseNullspace);
532 Set(coarseLevel, "P", Ptentative);
533
534 if (IsPrint(Statistics2)) {
535 RCP<ParameterList> params = rcp(new ParameterList());
536 params->set("printLoadBalancingInfo", true);
537 GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
538 }
539}
540
541template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
542void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
543    BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
544 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
545 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
546 RCP<MultiVector>& coarseNullspace, const int levelID) const {
547 auto rowMap = A->getRowMap();
548 auto colMap = A->getColMap();
549
550 const size_t numRows = rowMap->getLocalNumElements();
551 const size_t NSDim = fineNullspace->getNumVectors();
552
553#if KOKKOS_VERSION >= 40799
554 typedef KokkosKernels::ArithTraits<SC> ATS;
555#else
556 typedef Kokkos::ArithTraits<SC> ATS;
557#endif
558 using impl_SC = typename ATS::val_type;
559#if KOKKOS_VERSION >= 40799
560 using impl_ATS = KokkosKernels::ArithTraits<impl_SC>;
561#else
562 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
563#endif
564 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
565
566 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
567
568 typename Aggregates::local_graph_type aggGraph;
569 {
570 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
571 aggGraph = aggregates->GetGraph();
572 }
573 auto aggRows = aggGraph.row_map;
574 auto aggCols = aggGraph.entries;
575
576 // Aggregates map is based on the amalgamated column map
577 // We can skip global-to-local conversion if LIDs in row map are
578 // same as LIDs in column map
579 bool goodMap;
580 {
581 SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
582 goodMap = isGoodMap(*rowMap, *colMap);
583 }
584 // FIXME_KOKKOS: need to proofread later code for bad maps
585 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
586 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
587 "(i.e. \"matching\" row and column maps)");
588
589 // STEP 1: do unamalgamation
590 // The non-kokkos version uses member functions from the AmalgamationInfo
591 // container class to unamalgamate the data. In contrast, the kokkos
592 // version of TentativePFactory does the unamalgamation here and only uses
593 // the data of the AmalgamationInfo container class
594
595 // Extract information for unamalgamation
596 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
597 GO indexBase;
598 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
599 GO globalOffset = amalgInfo->GlobalOffset();
600
601 // Extract aggregation info (already in Kokkos host views)
602 auto procWinner = aggregates->GetProcWinner()->getLocalViewDevice(Tpetra::Access::ReadOnly);
603 auto vertex2AggId = aggregates->GetVertex2AggId()->getLocalViewDevice(Tpetra::Access::ReadOnly);
604 const size_t numAggregates = aggregates->GetNumAggregates();
605
606 int myPID = aggregates->GetMap()->getComm()->getRank();
607
608  // Create Kokkos::View (on the device) to store the aggregate dof sizes
609 // Later used to get aggregate dof offsets
610 // NOTE: This zeros itself on construction
611 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
612 AggSizeType aggDofSizes;
613
614 if (stridedBlockSize == 1) {
615 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
616
617 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
618 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
619
620 auto sizesConst = aggregates->ComputeAggregateSizes();
621 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), sizesConst);
622
623 } else {
624 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
625
626 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
627 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
628
629 auto nodeMap = aggregates->GetMap()->getLocalMap();
630 auto dofMap = colMap->getLocalMap();
631
632 Kokkos::parallel_for(
633 "MueLu:TentativePF:Build:compute_agg_sizes", range_type(0, numAggregates),
634 KOKKOS_LAMBDA(const LO agg) {
635 auto aggRowView = aggGraph.rowConst(agg);
636
637 size_t size = 0;
638 for (LO colID = 0; colID < aggRowView.length; colID++) {
639 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
640
641 for (LO k = 0; k < stridedBlockSize; k++) {
642 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
643
644 if (dofMap.getLocalElement(dofGID) != INVALID)
645 size++;
646 }
647 }
648 aggDofSizes(agg + 1) = size;
649 });
650 }
651
652 // Find maximum dof size for aggregates
653 // Later used to reserve enough scratch space for local QR decompositions
654 LO maxAggSize = 0;
655 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
656 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
657
658 // parallel_scan (exclusive)
659 // The aggDofSizes View then contains the aggregate dof offsets
660 Kokkos::parallel_scan(
661 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
662 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
663 update += aggDofSizes(i);
664 if (final_pass)
665 aggDofSizes(i) = update;
666 });
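  // For example, with three aggregates of sizes 3, 2 and 4 the view goes from
  // [0, 3, 2, 4] before the scan to [0, 3, 5, 9] after it, so aggregate i owns
  // the DOF slots [aggDofSizes(i), aggDofSizes(i+1)).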
667
668 // Create Kokkos::View on the device to store mapping
669 // between (local) aggregate id and row map ids (LIDs)
670 Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
671 {
672 SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
673
674 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
675 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
676
677 Kokkos::parallel_for(
678 "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
679 KOKKOS_LAMBDA(const LO lnode) {
680 if (procWinner(lnode, 0) == myPID) {
681 // No need for atomics, it's one-to-one
682 auto aggID = vertex2AggId(lnode, 0);
683
684 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
685 // FIXME: I think this may be wrong
686 // We unconditionally add the whole block here. When we calculated
687 // aggDofSizes, we did the isLocalElement check. Something's fishy.
688 for (LO k = 0; k < stridedBlockSize; k++)
689 agg2RowMapLO(offset + k) = lnode * stridedBlockSize + k;
690 }
691 });
692 }
693
694 // STEP 2: prepare local QR decomposition
695 // Reserve memory for tentative prolongation operator
696 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
697
698 // Pull out the nullspace vectors so that we can have random access (on the device)
699 auto fineNS = fineNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
700 auto coarseNS = coarseNullspace->getLocalViewDevice(Tpetra::Access::OverwriteAll);
701
702 size_t nnz = 0; // actual number of nnz
703
704 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_device_type local_matrix_type;
705 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
706 typedef typename local_matrix_type::index_type::non_const_type cols_type;
707 typedef typename local_matrix_type::values_type::non_const_type vals_type;
708
709 // Device View for status (error messages...)
710 typedef Kokkos::View<int[10], DeviceType> status_type;
711 status_type status("status");
712
713 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
714  typename AppendTrait<status_type, Kokkos::Atomic>::type statusAtomic = status;
715
716 const ParameterList& pL = GetParameterList();
717 const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
718 if (!doQRStep) {
719 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
720 if (NSDim > 1)
721      GetOStream(Warnings0) << "TentativePFactory : for nontrivial nullspace, this may degrade performance" << std::endl;
722 }
723
724 size_t nnzEstimate = numRows * NSDim;
725 rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows + 1);
726 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
727 vals_type valsAux("Ptent_aux_vals", nnzEstimate);
728 rows_type rows("Ptent_rows", numRows + 1);
729 {
730 // Stage 0: fill in views.
731 SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
732
733 // The main thing to notice is initialization of vals with INVALID. These
734 // values will later be used to compress the arrays
735 Kokkos::parallel_for(
736 "MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows + 1),
737 KOKKOS_LAMBDA(const LO row) {
738 rowsAux(row) = row * NSDim;
739 });
740 Kokkos::parallel_for(
741 "MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
742 KOKKOS_LAMBDA(const LO j) {
743 colsAux(j) = INVALID;
744 });
745 }
746
747 if (NSDim == 1) {
748    // 1D is special, as it is the easiest. We don't even need to do the QR,
749    // just normalize an array. Plus, no worries about small aggregates. In
750 // addition, we do not worry about compression. It is unlikely that
751 // nullspace will have zeros. If it does, a prolongator row would be
752 // zero and we'll get singularity anyway.
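    // Put differently: for a single nullspace vector b restricted to an aggregate,
    // the factorization reduces to Q = b / ||b|| and R = ||b||, which is exactly
    // what the kernels below compute entry by entry.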
753 SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
754
755 // Set up team policy with numAggregates teams and one thread per team.
756 // Each team handles a slice of the data associated with one aggregate
757 // and performs a local QR decomposition (in this case real QR is
758 // unnecessary).
759 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
760
761 if (doQRStep) {
762 Kokkos::parallel_for(
763 "MueLu:TentativePF:BuildUncoupled:main_loop", policy,
764 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
765 auto agg = thread.league_rank();
766
767 // size of the aggregate (number of DOFs in aggregate)
768 LO aggSize = aggRows(agg + 1) - aggRows(agg);
769
770 // Extract the piece of the nullspace corresponding to the aggregate, and
771 // put it in the flat array, "localQR" (in column major format) for the
772 // QR routine. Trivial in 1D.
773 auto norm = impl_ATS::magnitude(zero);
774
775 // Calculate QR by hand
776 // FIXME: shouldn't there be stridedblock here?
777 // FIXME_KOKKOS: shouldn't there be stridedblock here?
778 for (decltype(aggSize) k = 0; k < aggSize; k++) {
779 auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0));
780 norm += dnorm * dnorm;
781 }
782 norm = sqrt(norm);
783
784 if (norm == zero) {
785 // zero column; terminate the execution
786 statusAtomic(1) = true;
787 return;
788 }
789
790 // R = norm
791 coarseNS(agg, 0) = norm;
792
793 // Q = localQR(:,0)/norm
794 for (decltype(aggSize) k = 0; k < aggSize; k++) {
795 LO localRow = agg2RowMapLO(aggRows(agg) + k);
796 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0) / norm;
797
798 rows(localRow + 1) = 1;
799 colsAux(localRow) = agg;
800 valsAux(localRow) = localVal;
801 }
802 });
803
804 typename status_type::host_mirror_type statusHost = Kokkos::create_mirror_view(status);
805 Kokkos::deep_copy(statusHost, status);
806 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
807 if (statusHost(i)) {
808 std::ostringstream oss;
809 oss << "MueLu::TentativePFactory::MakeTentative: ";
810 switch (i) {
811 case 0: oss << "!goodMap is not implemented"; break;
812 case 1: oss << "fine level NS part has a zero column"; break;
813 }
814 throw Exceptions::RuntimeError(oss.str());
815 }
816
817 } else {
818 Kokkos::parallel_for(
819 "MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
820 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
821 auto agg = thread.league_rank();
822
823 // size of the aggregate (number of DOFs in aggregate)
824 LO aggSize = aggRows(agg + 1) - aggRows(agg);
825
826 // R = norm
827 coarseNS(agg, 0) = one;
828
829 // Q = localQR(:,0)/norm
830 for (decltype(aggSize) k = 0; k < aggSize; k++) {
831 LO localRow = agg2RowMapLO(aggRows(agg) + k);
832 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0);
833
834 rows(localRow + 1) = 1;
835 colsAux(localRow) = agg;
836 valsAux(localRow) = localVal;
837 }
838 });
839 }
840
841 Kokkos::parallel_reduce(
842 "MueLu:TentativeP:CountNNZ", range_type(0, numRows + 1),
843 KOKKOS_LAMBDA(const LO i, size_t& nnz_count) {
844 nnz_count += rows(i);
845 },
846 nnz);
847
848 } else { // NSdim > 1
849 // FIXME_KOKKOS: This code branch is completely unoptimized.
850 // Work to do:
851 // - Optimize QR decomposition
852 // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
853 // packing new values in the beginning of each row
854    // We do use an auxiliary view in this case, so keep a second rows view for
855 // counting nonzeros in rows
856
857 {
858 SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
859 // Set up team policy with numAggregates teams and one thread per team.
860 // Each team handles a slice of the data associated with one aggregate
861 // and performs a local QR decomposition
862      const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);  // numAggregates teams of 1 thread each
863 LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
864 decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
865 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
866 decltype(valsAux)>
867 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
868                        rows, rowsAux, colsAux, valsAux, doQRStep);
869      Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
870 }
871
872 typename status_type::host_mirror_type statusHost = Kokkos::create_mirror_view(status);
873 Kokkos::deep_copy(statusHost, status);
874 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
875 if (statusHost(i)) {
876 std::ostringstream oss;
877 oss << "MueLu::TentativePFactory::MakeTentative: ";
878 switch (i) {
879 case 0: oss << "!goodMap is not implemented"; break;
880 case 1: oss << "fine level NS part has a zero column"; break;
881 }
882 throw Exceptions::RuntimeError(oss.str());
883 }
884 }
885
886 // Compress the cols and vals by ignoring INVALID column entries that correspond
887 // to 0 in QR.
888
889 // The real cols and vals are constructed using calculated (not estimated) nnz
890 cols_type cols;
891 vals_type vals;
892
893 if (nnz != nnzEstimate) {
894 {
895 // Stage 2: compress the arrays
896 SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
897
898 Kokkos::parallel_scan(
899 "MueLu:TentativePF:Build:compress_rows", range_type(0, numRows + 1),
900 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
901 upd += rows(i);
902 if (final)
903 rows(i) = upd;
904 });
905 }
906
907 {
908 SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
909
910 cols = cols_type("Ptent_cols", nnz);
911 vals = vals_type("Ptent_vals", nnz);
912
913      // FIXME_KOKKOS: this can be sped up by moving correct cols and vals values
914 // to the beginning of rows. See CoalesceDropFactory_kokkos for
915 // example.
916 Kokkos::parallel_for(
917 "MueLu:TentativePF:Build:compress_cols_vals", range_type(0, numRows),
918 KOKKOS_LAMBDA(const LO i) {
919 LO rowStart = rows(i);
920
921 size_t lnnz = 0;
922 for (auto j = rowsAux(i); j < rowsAux(i + 1); j++)
923 if (colsAux(j) != INVALID) {
924 cols(rowStart + lnnz) = colsAux(j);
925 vals(rowStart + lnnz) = valsAux(j);
926 lnnz++;
927 }
928 });
929 }
930
931 } else {
932 rows = rowsAux;
933 cols = colsAux;
934 vals = valsAux;
935 }
936
937 GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
938
939 {
940 // Stage 3: construct Xpetra::Matrix
941 SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
942
943 local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
944
945 // Managing labels & constants for ESFC
946 RCP<ParameterList> FCparams;
947 if (pL.isSublist("matrixmatrix: kernel params"))
948 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
949 else
950 FCparams = rcp(new ParameterList);
951
952 // By default, we don't need global constants for TentativeP
953 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
954 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
955
956 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
957 Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
958 }
959}
960
961template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
962void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
963    BuildPuncoupledBlockCrs(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
964 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
965 RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
966 RCP<MultiVector>& coarseNullspace, const int levelID) const {
967 /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
968 be generalized later, if we ever need to do so:
969 1) Null space dimension === block size of matrix: So no elasticity right now
970 2) QR is not supported: Under assumption #1, this shouldn't cause problems.
971 3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
972
973 These assumptions keep our code way simpler and still support the use cases we actually care about.
974 */
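  // Consequence of assumption (1): every fine block row of P carries exactly one dense
  // NSDim x NSDim block, whose entries are simply the fine nullspace values of that node,
  // and the coarse nullspace (the "R" factor) is just the identity. The graph and value
  // fills below follow this layout.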
975
976 RCP<const Map> rowMap = A->getRowMap();
977 RCP<const Map> rangeMap = A->getRangeMap();
978 RCP<const Map> colMap = A->getColMap();
979 // const size_t numFinePointRows = rangeMap->getLocalNumElements();
980 const size_t numFineBlockRows = rowMap->getLocalNumElements();
981
982 // typedef Teuchos::ScalarTraits<SC> STS;
983 // typedef typename STS::magnitudeType Magnitude;
984 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
985
986#if KOKKOS_VERSION >= 40799
987 typedef KokkosKernels::ArithTraits<SC> ATS;
988#else
989 typedef Kokkos::ArithTraits<SC> ATS;
990#endif
991 using impl_SC = typename ATS::val_type;
992#if KOKKOS_VERSION >= 40799
993 using impl_ATS = KokkosKernels::ArithTraits<impl_SC>;
994#else
995 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
996#endif
997 const impl_SC one = impl_ATS::one();
998
999 // const GO numAggs = aggregates->GetNumAggregates();
1000 const size_t NSDim = fineNullspace->getNumVectors();
1001 auto aggSizes = aggregates->ComputeAggregateSizes();
1002
1003 typename Aggregates::local_graph_type aggGraph;
1004 {
1005 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
1006 aggGraph = aggregates->GetGraph();
1007 }
1008 auto aggRows = aggGraph.row_map;
1009 auto aggCols = aggGraph.entries;
1010
1011 // Need to generate the coarse block map
1012 // NOTE: We assume NSDim == block size here
1013 // NOTE: We also assume that coarseMap has contiguous GIDs
1014 // const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
1015 const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1016 RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1017 Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
1018 numCoarseBlockRows,
1019 coarsePointMap->getIndexBase(),
1020 coarsePointMap->getComm());
1021 // Sanity checking
1022 const ParameterList& pL = GetParameterList();
1023 // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1024
1025 // The aggregates use the amalgamated column map, which in this case is what we want
1026
1027 // Aggregates map is based on the amalgamated column map
1028 // We can skip global-to-local conversion if LIDs in row map are
1029 // same as LIDs in column map
1030 bool goodMap = MueLu::Utilities<SC, LO, GO, NO>::MapsAreNested(*rowMap, *colMap);
1031 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
1032 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1033 "(i.e. \"matching\" row and column maps)");
1034
1035 // STEP 1: do unamalgamation
1036 // The non-kokkos version uses member functions from the AmalgamationInfo
1037 // container class to unamalgamate the data. In contrast, the kokkos
1038 // version of TentativePFactory does the unamalgamation here and only uses
1039 // the data of the AmalgamationInfo container class
1040
1041 // Extract information for unamalgamation
1042 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1043 GO indexBase;
1044 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1045 // GO globalOffset = amalgInfo->GlobalOffset();
1046
1047 // Extract aggregation info (already in Kokkos host views)
1048 auto procWinner = aggregates->GetProcWinner()->getLocalViewDevice(Tpetra::Access::ReadOnly);
1049 auto vertex2AggId = aggregates->GetVertex2AggId()->getLocalViewDevice(Tpetra::Access::ReadOnly);
1050 const size_t numAggregates = aggregates->GetNumAggregates();
1051
1052 int myPID = aggregates->GetMap()->getComm()->getRank();
1053
1054  // Create Kokkos::View (on the device) to store the aggregate dof sizes
1055 // Later used to get aggregate dof offsets
1056 // NOTE: This zeros itself on construction
1057 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1058 AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1059
1060 {
1061 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1062
1063 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1064 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
1065
1066 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), aggSizes);
1067 }
1068
1069 // Find maximum dof size for aggregates
1070 // Later used to reserve enough scratch space for local QR decompositions
1071 LO maxAggSize = 0;
1072 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
1073 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1074
1075 // parallel_scan (exclusive)
1076 // The aggDofSizes View then contains the aggregate dof offsets
1077 Kokkos::parallel_scan(
1078 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
1079 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1080 update += aggDofSizes(i);
1081 if (final_pass)
1082 aggDofSizes(i) = update;
1083 });
1084
1085 // Create Kokkos::View on the device to store mapping
1086 // between (local) aggregate id and row map ids (LIDs)
1087 Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1088 {
1089 SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1090
1091 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1092 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1093
1094 Kokkos::parallel_for(
1095 "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1096 KOKKOS_LAMBDA(const LO lnode) {
1097 if (procWinner(lnode, 0) == myPID) {
1098 // No need for atomics, it's one-to-one
1099 auto aggID = vertex2AggId(lnode, 0);
1100
1101 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
1102 // FIXME: I think this may be wrong
1103 // We unconditionally add the whole block here. When we calculated
1104 // aggDofSizes, we did the isLocalElement check. Something's fishy.
1105 for (LO k = 0; k < stridedBlockSize; k++)
1106 aggToRowMapLO(offset + k) = lnode * stridedBlockSize + k;
1107 }
1108 });
1109 }
1110
1111 // STEP 2: prepare local QR decomposition
1112 // Reserve memory for tentative prolongation operator
1113 coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1114
1115 // Pull out the nullspace vectors so that we can have random access (on the device)
1116 auto fineNS = fineNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
1117 auto coarseNS = coarseNullspace->getLocalViewDevice(Tpetra::Access::OverwriteAll);
1118
1119 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_device_type local_matrix_type;
1120 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1121 typedef typename local_matrix_type::index_type::non_const_type cols_type;
1122 // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1123
1124 // Device View for status (error messages...)
1125 typedef Kokkos::View<int[10], DeviceType> status_type;
1126 status_type status("status");
1127
1128 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
1130
1131 // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1132 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1133
1134 // BlockCrs requires that we build the (block) graph first, so let's do that...
1135
1136 // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1137 // block non-zero per row in the matrix;
1138 rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1139 cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1140
1141 Kokkos::parallel_for(
1142 "MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1143 KOKKOS_LAMBDA(const LO j) {
1144 ia[j] = j;
1145 ja[j] = INVALID;
1146
1147 if (j == (LO)numFineBlockRows - 1)
1148 ia[numFineBlockRows] = numFineBlockRows;
1149 });
1150
1151 // Fill Graph
1152 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1153 Kokkos::parallel_for(
1154 "MueLu:TentativePF:BlockCrs:fillGraph", policy,
1155 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1156 auto agg = thread.league_rank();
1157 Xpetra::global_size_t offset = agg;
1158
1159 // size of the aggregate (number of DOFs in aggregate)
1160 LO aggSize = aggRows(agg + 1) - aggRows(agg);
1161
1162 for (LO j = 0; j < aggSize; j++) {
1163 // FIXME: Allow for bad maps
1164 const LO localRow = aggToRowMapLO[aggDofSizes[agg] + j];
1165 const size_t rowStart = ia[localRow];
1166 ja[rowStart] = offset;
1167 }
1168 });
1169
1170 // Compress storage (remove all INVALID, which happen when we skip zeros)
1171 // We do that in-place
1172 {
1173 // Stage 2: compress the arrays
1174 SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1175 // Fill i_temp with the correct row starts
1176 rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1177 LO nnz = 0;
1178 Kokkos::parallel_scan(
1179 "MueLu:TentativePF:BlockCrs:compress_rows", range_type(0, numFineBlockRows),
1180 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1181 if (final)
1182 i_temp[i] = upd;
1183 for (auto j = ia[i]; j < ia[i + 1]; j++)
1184 if (ja[j] != INVALID)
1185 upd++;
1186 if (final && i == (LO)numFineBlockRows - 1)
1187 i_temp[numFineBlockRows] = upd;
1188 },
1189 nnz);
1190
1191 cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1192
1193 Kokkos::parallel_for(
1194 "MueLu:TentativePF:BlockCrs:compress_cols", range_type(0, numFineBlockRows),
1195 KOKKOS_LAMBDA(const LO i) {
1196 size_t rowStart = i_temp[i];
1197 size_t lnnz = 0;
1198 for (auto j = ia[i]; j < ia[i + 1]; j++)
1199 if (ja[j] != INVALID) {
1200 j_temp[rowStart + lnnz] = ja[j];
1201 lnnz++;
1202 }
1203 });
1204
1205 ia = i_temp;
1206 ja = j_temp;
1207 }
1208
1209 RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, ia, ja);
1210
1211 // Managing labels & constants for ESFC
1212 {
1213 RCP<ParameterList> FCparams;
1214 if (pL.isSublist("matrixmatrix: kernel params"))
1215 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1216 else
1217 FCparams = rcp(new ParameterList);
1218 // By default, we don't need global constants for TentativeP
1219 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
1220 std::string levelIDs = toString(levelID);
1221 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs);
1222 RCP<const Export> dummy_e;
1223 RCP<const Import> dummy_i;
1224 BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams);
1225 }
1226
1227 // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1228 // we clear them here
1229 ia = rows_type();
1230 ja = cols_type();
1231
1232 // Now let's make a BlockCrs Matrix
1233 // NOTE: Assumes block size== NSDim
1234 RCP<Xpetra::CrsMatrix<SC, LO, GO, NO>> P_xpetra = Xpetra::CrsMatrixFactory<SC, LO, GO, NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim);
1235 RCP<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>> P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>>(P_xpetra);
1236 if (P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1237 RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1238
1239 auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1240 const LO stride = NSDim * NSDim;
1241
1242 Kokkos::parallel_for(
1243 "MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1244 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1245 auto agg = thread.league_rank();
1246
1247 // size of the aggregate (number of DOFs in aggregate)
1248 LO aggSize = aggRows(agg + 1) - aggRows(agg);
1249 Xpetra::global_size_t offset = agg * NSDim;
1250
1251 // Q = localQR(:,0)/norm
1252 for (LO j = 0; j < aggSize; j++) {
1253 LO localBlockRow = aggToRowMapLO(aggRows(agg) + j);
1254 LO rowStart = localBlockRow * stride;
1255 for (LO r = 0; r < (LO)NSDim; r++) {
1256 LO localPointRow = localBlockRow * NSDim + r;
1257 for (LO c = 0; c < (LO)NSDim; c++) {
1258 values[rowStart + r * NSDim + c] = fineNSRandom(localPointRow, c);
1259 }
1260 }
1261 }
1262
1263 // R = norm
1264 for (LO j = 0; j < (LO)NSDim; j++)
1265 coarseNS(offset + j, j) = one;
1266 });
1267
1268 Ptentative = P_wrap;
1269}
1270
1271template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1272void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1273    BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates> /* aggregates */,
1274 RCP<AmalgamationInfo> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1275 RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1276 RCP<MultiVector>& /* coarseNullspace */) const {
1277 throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1278}
1279
1280template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1281bool TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1282    isGoodMap(const Map& rowMap, const Map& colMap) const {
1283 auto rowLocalMap = rowMap.getLocalMap();
1284 auto colLocalMap = colMap.getLocalMap();
1285
1286 const size_t numRows = rowLocalMap.getLocalNumElements();
1287 const size_t numCols = colLocalMap.getLocalNumElements();
1288
1289 if (numCols < numRows)
1290 return false;
1291
1292 size_t numDiff = 0;
1293 Kokkos::parallel_reduce(
1294 "MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1295 KOKKOS_LAMBDA(const LO i, size_t& diff) {
1296 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1297 },
1298 numDiff);
1299
1300 return (numDiff == 0);
1301}
1302
1303} // namespace MueLu
1304
1305#define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1306#endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP