MueLu Version of the Day
MueLu_TentativePFactory_kokkos_def.hpp
1// @HEADER
2// *****************************************************************************
3// MueLu: A package for multigrid based preconditioning
4//
5// Copyright 2012 NTESS and the MueLu contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
11#define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
12
13#include "Kokkos_UnorderedMap.hpp"
14#include "Xpetra_CrsGraphFactory.hpp"
15
16#include "MueLu_TentativePFactory_kokkos_decl.hpp"
17
18#include "MueLu_Aggregates.hpp"
19#include "MueLu_AmalgamationInfo.hpp"
20#include "MueLu_AmalgamationFactory.hpp"
21
22#include "MueLu_MasterList.hpp"
23#include "MueLu_PerfUtils.hpp"
24#include "MueLu_Monitor.hpp"
25
26#include "Xpetra_IO.hpp"
27
28namespace MueLu {
29
30namespace { // anonymous
31
32template <class LocalOrdinal, class View>
33class ReduceMaxFunctor {
34 public:
35 ReduceMaxFunctor(View view)
36 : view_(view) {}
37
38 KOKKOS_INLINE_FUNCTION
39 void operator()(const LocalOrdinal& i, LocalOrdinal& vmax) const {
40 if (vmax < view_(i))
41 vmax = view_(i);
42 }
43
44 KOKKOS_INLINE_FUNCTION
45 void join(LocalOrdinal& dst, const LocalOrdinal& src) const {
46 if (dst < src) {
47 dst = src;
48 }
49 }
50
51 KOKKOS_INLINE_FUNCTION
52 void init(LocalOrdinal& dst) const {
53 dst = 0;
54 }
55
56 private:
57 View view_;
58};
59
60// local QR decomposition
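// Each team processes one aggregate: it gathers the fine nullspace rows that belong to
// the aggregate, computes a local QR factorization via Householder reflections (unless
// doQRStep is false), writes R into the aggregate's block of the coarse nullspace, and
// scatters the nonzero entries of Q into the auxiliary CRS arrays (rowsAux/colsAux/valsAux)
// that later become the tentative prolongator P.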
61template <class LOType, class GOType, class SCType, class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
62class LocalQRDecompFunctor {
63 private:
64 typedef LOType LO;
65 typedef GOType GO;
66 typedef SCType SC;
67
68 typedef typename DeviceType::execution_space execution_space;
69 typedef typename KokkosKernels::ArithTraits<SC>::val_type impl_SC;
70 typedef KokkosKernels::ArithTraits<impl_SC> impl_ATS;
71 typedef typename impl_ATS::magnitudeType Magnitude;
72
73 typedef Kokkos::View<impl_SC**, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
74 typedef Kokkos::View<impl_SC*, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
75
76 private:
77 NspType fineNS;
78 NspType coarseNS;
79 aggRowsType aggRows;
80 maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
81 agg2RowMapLOType agg2RowMapLO;
82 statusType statusAtomic;
83 rowsType rows;
84 rowsAuxType rowsAux;
85 colsAuxType colsAux;
86 valsAuxType valsAux;
87 bool doQRStep;
88
89 public:
90 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_)
91 : fineNS(fineNS_)
92 , coarseNS(coarseNS_)
93 , aggRows(aggRows_)
94 , maxAggDofSize(maxAggDofSize_)
95 , agg2RowMapLO(agg2RowMapLO_)
96 , statusAtomic(statusAtomic_)
97 , rows(rows_)
98 , rowsAux(rowsAux_)
99 , colsAux(colsAux_)
100 , valsAux(valsAux_)
101 , doQRStep(doQRStep_) {}
102
103 KOKKOS_INLINE_FUNCTION
104 void operator()(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread, size_t& nnz) const {
105 auto agg = thread.league_rank();
106
107 // size of aggregate: number of DOFs in aggregate
108 auto aggSize = aggRows(agg + 1) - aggRows(agg);
109
110 const impl_SC one = impl_ATS::one();
111 const impl_SC two = one + one;
112 const impl_SC zero = impl_ATS::zero();
113 const auto zeroM = impl_ATS::magnitude(zero);
114
115 int m = aggSize;
116 int n = fineNS.extent(1);
117
118 // calculate row offset for coarse nullspace
119 Xpetra::global_size_t offset = agg * n;
120
121 if (doQRStep) {
122 // Extract the piece of the nullspace corresponding to the aggregate
123 shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
124 for (int j = 0; j < n; j++)
125 for (int k = 0; k < m; k++)
126 r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j);
127#if 0
128 printf("A\n");
129 for (int i = 0; i < m; i++) {
130 for (int j = 0; j < n; j++)
131 printf(" %5.3lf ", r(i,j));
132 printf("\n");
133 }
134#endif
135
136 // Calculate QR decomposition (standard)
137 shared_matrix q(thread.team_shmem(), m, m); // Q
138 if (m >= n) {
139 bool isSingular = false;
140
141 // Initialize Q^T
142 auto qt = q;
143 for (int i = 0; i < m; i++) {
144 for (int j = 0; j < m; j++)
145 qt(i, j) = zero;
146 qt(i, i) = one;
147 }
148
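// Householder QR written out by hand: for each column k we form the reflector
// v = x - ||x||*e_1 with x = R(k:m,k), temporarily store v/||v|| in R(k:m,k),
// apply H = I - 2*v*v^T/(v^T*v) to the trailing columns of R and to Q^T, and
// finally overwrite column k of R with (||x||, 0, ..., 0)^T.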
149 for (int k = 0; k < n; k++) { // iterate up to "n" (not "n-1") so that the last column is normalized too
150 // FIXME_KOKKOS: use team
151 Magnitude s = zeroM, norm, norm_x;
152 for (int i = k + 1; i < m; i++)
153 s += pow(impl_ATS::magnitude(r(i, k)), 2);
154 norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
155
156 if (norm == zero) {
157 isSingular = true;
158 break;
159 }
160
161 r(k, k) -= norm * one;
162
163 norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
164 if (norm_x == zeroM) {
165 // We have a single diagonal element in the column.
166 // No reflections required. Just need to restore r(k,k).
167 r(k, k) = norm * one;
168 continue;
169 }
170
171 // FIXME_KOKKOS: use team
172 for (int i = k; i < m; i++)
173 r(i, k) /= norm_x;
174
175 // Update R(k:m,k+1:n)
176 for (int j = k + 1; j < n; j++) {
177 // FIXME_KOKKOS: use team in the loops
178 impl_SC si = zero;
179 for (int i = k; i < m; i++)
180 si += r(i, k) * r(i, j);
181 for (int i = k; i < m; i++)
182 r(i, j) -= two * si * r(i, k);
183 }
184
185 // Update Q^T (k:m,k:m)
186 for (int j = k; j < m; j++) {
187 // FIXME_KOKKOS: use team in the loops
188 impl_SC si = zero;
189 for (int i = k; i < m; i++)
190 si += r(i, k) * qt(i, j);
191 for (int i = k; i < m; i++)
192 qt(i, j) -= two * si * r(i, k);
193 }
194
195 // Fix R(k:m,k)
196 r(k, k) = norm * one;
197 for (int i = k + 1; i < m; i++)
198 r(i, k) = zero;
199 }
200
201#if 0
202 // Q = (Q^T)^T
203 for (int i = 0; i < m; i++)
204 for (int j = 0; j < i; j++) {
205 impl_SC tmp = qt(i,j);
206 qt(i,j) = qt(j,i);
207 qt(j,i) = tmp;
208 }
209#endif
210
211 // Build coarse nullspace using the upper triangular part of R
212 for (int j = 0; j < n; j++)
213 for (int k = 0; k <= j; k++)
214 coarseNS(offset + k, j) = r(k, j);
215
216 if (isSingular) {
217 statusAtomic(1) = true;
218 return;
219 }
220
221 } else {
222 // Special handling for m < n (i.e. single node aggregates in structural mechanics)
223
224 // The local QR decomposition is not possible in the "overconstrained"
225 // case (i.e. number of columns in qr > number of rows), which
226 // corresponds to #DOFs in Aggregate < n. For usual problems this
227 // is only possible for single node aggregates in structural mechanics.
228 // (Similar problems may arise in discontinuous Galerkin problems...)
229 // We bypass the QR decomposition and use an identity block in the
230 // tentative prolongator for the single node aggregate and transfer the
231 // corresponding fine level null space information 1-to-1 to the coarse
232 // level null space part.
233
234 // NOTE: The resulting tentative prolongation operator has
235 // (m*DofsPerNode-n) zero columns leading to a singular
236 // coarse level operator A. To deal with that one has the following
237 // options:
238 // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
239 // false) to add an identity block to the diagonal of the zero rows
240 // in the coarse level operator A, such that standard level smoothers
241 // can be used again.
242 // - Use special (projection-based) level smoothers, which can deal
243 // with singular matrices (very application specific)
244 // - Adapt the code below to avoid zero columns. However, we do not
245 // support a variable number of DOFs per node in MueLu/Xpetra which
246 // makes the implementation really hard.
247 //
248 // FIXME: do we need to check for singularity here somehow? Zero
249 // columns would be easy but linear dependency would require proper QR.
250
251 // R = extended (by adding identity rows) qr
252 for (int j = 0; j < n; j++)
253 for (int k = 0; k < n; k++)
254 if (k < m)
255 coarseNS(offset + k, j) = r(k, j);
256 else
257 coarseNS(offset + k, j) = (k == j ? one : zero);
258
259 // Q = I (rectangular)
260 for (int i = 0; i < m; i++)
261 for (int j = 0; j < n; j++)
262 q(i, j) = (j == i ? one : zero);
263 }
264
265 // Process each row in the local Q factor and fill helper arrays to assemble P
266 for (int j = 0; j < m; j++) {
267 LO localRow = agg2RowMapLO(aggRows(agg) + j);
268 size_t rowStart = rowsAux(localRow);
269 size_t lnnz = 0;
270 for (int k = 0; k < n; k++) {
271 // skip zeros
272 if (q(j, k) != zero) {
273 colsAux(rowStart + lnnz) = offset + k;
274 valsAux(rowStart + lnnz) = q(j, k);
275 lnnz++;
276 }
277 }
278 rows(localRow + 1) = lnnz;
279 nnz += lnnz;
280 }
281
282#if 0
283 printf("R\n");
284 for (int i = 0; i < m; i++) {
285 for (int j = 0; j < n; j++)
286 printf(" %5.3lf ", coarseNS(i,j));
287 printf("\n");
288 }
289
290 printf("Q\n");
291 for (int i = 0; i < aggSize; i++) {
292 for (int j = 0; j < aggSize; j++)
293 printf(" %5.3lf ", q(i,j));
294 printf("\n");
295 }
296#endif
297 } else {
298 ///////////////////////////////
299 //      "no-QR" option       //
300 ///////////////////////////////
301 // Local Q factor is just the fine nullspace support over the current aggregate.
302 // Local R factor is the identity.
303 // TODO I have not implemented any special handling for aggregates that are too
304 // TODO small to locally support the nullspace, as is done in the standard QR
305 // TODO case above.
306
307 for (int j = 0; j < m; j++) {
308 LO localRow = agg2RowMapLO(aggRows(agg) + j);
309 size_t rowStart = rowsAux(localRow);
310 size_t lnnz = 0;
311 for (int k = 0; k < n; k++) {
312 const impl_SC qr_jk = fineNS(localRow, k);
313 // skip zeros
314 if (qr_jk != zero) {
315 colsAux(rowStart + lnnz) = offset + k;
316 valsAux(rowStart + lnnz) = qr_jk;
317 lnnz++;
318 }
319 }
320 rows(localRow + 1) = lnnz;
321 nnz += lnnz;
322 }
323
324 for (int j = 0; j < n; j++)
325 coarseNS(offset + j, j) = one;
326 }
327 }
328
329 // amount of shared memory
330 size_t team_shmem_size(int /* team_size */) const {
331 if (doQRStep) {
332 int m = maxAggDofSize;
333 int n = fineNS.extent(1);
334 return shared_matrix::shmem_size(m, n) + // r
335 shared_matrix::shmem_size(m, m); // q
336 } else
337 return 0;
338 }
339};
340
341} // namespace
342
343template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
344RCP<const ParameterList> TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::GetValidParameterList() const {
345 RCP<ParameterList> validParamList = rcp(new ParameterList());
346
347#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
348 SET_VALID_ENTRY("tentative: calculate qr");
349 SET_VALID_ENTRY("tentative: build coarse coordinates");
350#undef SET_VALID_ENTRY
351
352 validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
353 validParamList->set<RCP<const FactoryBase>>("Aggregates", Teuchos::null, "Generating factory of the aggregates");
354 validParamList->set<RCP<const FactoryBase>>("Nullspace", Teuchos::null, "Generating factory of the nullspace");
355 validParamList->set<RCP<const FactoryBase>>("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
356 validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
357 validParamList->set<RCP<const FactoryBase>>("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
358 validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory of the coordinates");
359 validParamList->set<RCP<const FactoryBase>>("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
360
361 // Make sure we don't recursively validate options for the matrixmatrix kernels
362 ParameterList norecurse;
363 norecurse.disableRecursiveValidation();
364 validParamList->set<ParameterList>("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
365
366 return validParamList;
367}
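// Illustrative usage sketch (not part of this file; "tentFact" is a hypothetical handle
// to this factory): the options validated above are typically set through the factory's
// parameter list, e.g.
//
//   Teuchos::ParameterList tentParams;
//   tentParams.set("tentative: calculate qr", false);            // skip the local QR step
//   tentParams.set("tentative: build coarse coordinates", true); // transfer coordinates
//   tentFact->SetParameterList(tentParams);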
368
369template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
370void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::DeclareInput(Level& fineLevel, Level& coarseLevel) const {
371 const ParameterList& pL = GetParameterList();
372 // NOTE: This can only be 'Nullspace' or 'Scaled Nullspace', otherwise the validator above will cause issues
373 std::string nspName = "Nullspace";
374 if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
375
376 Input(fineLevel, "A");
377 Input(fineLevel, "Aggregates");
378 Input(fineLevel, nspName);
379 Input(fineLevel, "UnAmalgamationInfo");
380 Input(fineLevel, "CoarseMap");
381 if (fineLevel.GetLevelID() == 0 &&
382 fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
383 pL.get<bool>("tentative: build coarse coordinates")) { // and we want coordinates on other levels
384 bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
385 Input(fineLevel, "Coordinates");
386 } else if (bTransferCoordinates_) {
387 Input(fineLevel, "Coordinates");
388 }
389}
390
391template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
392void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& fineLevel, Level& coarseLevel) const {
393 return BuildP(fineLevel, coarseLevel);
394}
395
396template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
397void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::BuildP(Level& fineLevel, Level& coarseLevel) const {
398 FactoryMonitor m(*this, "Build", coarseLevel);
399
400 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
401 typedef Xpetra::MultiVectorFactory<coordinate_type, LO, GO, NO> RealValuedMultiVectorFactory;
402 const ParameterList& pL = GetParameterList();
403 std::string nspName = "Nullspace";
404 if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
405
406 auto A = Get<RCP<Matrix>>(fineLevel, "A");
407 auto aggregates = Get<RCP<Aggregates>>(fineLevel, "Aggregates");
408 auto amalgInfo = Get<RCP<AmalgamationInfo>>(fineLevel, "UnAmalgamationInfo");
409 auto fineNullspace = Get<RCP<MultiVector>>(fineLevel, nspName);
410 auto coarseMap = Get<RCP<const Map>>(fineLevel, "CoarseMap");
411 RCP<RealValuedMultiVector> fineCoords;
412 if (bTransferCoordinates_) {
413 fineCoords = Get<RCP<RealValuedMultiVector>>(fineLevel, "Coordinates");
414 }
415
416 RCP<Matrix> Ptentative;
417 // No coarse DoFs, so we need to bail out by setting Ptentative to null and returning
418 // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
419 if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
420 Ptentative = Teuchos::null;
421 Set(coarseLevel, "P", Ptentative);
422 return;
423 }
424 RCP<MultiVector> coarseNullspace;
425 RCP<RealValuedMultiVector> coarseCoords;
426
427 if (bTransferCoordinates_) {
428 RCP<const Map> coarseCoordMap;
429
430 LO blkSize = 1;
431 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
432 blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
433
434 if (blkSize == 1) {
435 // Scalar system
436 // No amalgamation required, we can use the coarseMap
437 coarseCoordMap = coarseMap;
438 } else {
439 // Vector system
440 AmalgamationFactory<SC, LO, GO, NO>::AmalgamateMap(rcp_dynamic_cast<const StridedMap>(coarseMap), coarseCoordMap);
441 }
442
443 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors(), false);
444
445 // Create overlapped fine coordinates to reduce global communication
446 auto uniqueMap = fineCoords->getMap();
447 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
448 if (aggregates->AggregatesCrossProcessors()) {
449 auto nonUniqueMap = aggregates->GetMap();
450 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
451
452 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors(), false);
453 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
454 }
455
456 // The good news is that this graph has already been constructed for the
457 // TentativePFactory and was cached in Aggregates. So this is a no-op.
458 auto aggGraph = aggregates->GetGraph();
459 auto numAggs = aggGraph.numRows();
460
461 auto fineCoordsView = fineCoords->getLocalViewDevice(Tpetra::Access::ReadOnly);
462 auto coarseCoordsView = coarseCoords->getLocalViewDevice(Tpetra::Access::OverwriteAll);
463
464 // Fill in coarse coordinates
465 {
466 SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
467
468 const auto dim = fineCoords->getNumVectors();
469
470 typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
471 for (size_t j = 0; j < dim; j++) {
472 Kokkos::parallel_for(
473 "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
474 KOKKOS_LAMBDA(const LO i) {
475 // A row in this graph represents all node ids in the aggregate
476 // Therefore, averaging is very easy
477
478 auto aggregate = aggGraph.rowConst(i);
479
480 coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
481 for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
482 sum += fineCoordsRandomView(aggregate(colID), j);
483
484 coarseCoordsView(i, j) = sum / aggregate.length;
485 });
486 }
487 }
488 }
489
490 if (!aggregates->AggregatesCrossProcessors()) {
491 if (Xpetra::Helpers<SC, LO, GO, NO>::isTpetraBlockCrs(A)) {
492 BuildPuncoupledBlockCrs(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
493 coarseLevel.GetLevelID());
494 } else {
495 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
496 }
497 } else
498 BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
499
500 // If available, use striding information of fine level matrix A for range
501 // map and coarseMap as domain map; otherwise use plain range map of
502 // Ptent = plain range map of A for range map and coarseMap as domain map.
503 // NOTE:
504 // The latter is not really safe, since there is no striding information
505 // for the range map. This is not really a problem, since striding
506 // information is always available on the intermediate levels and the
507 // coarsest levels.
508 if (A->IsView("stridedMaps") == true)
509 Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
510 else
511 Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
512
513 if (bTransferCoordinates_) {
514 Set(coarseLevel, "Coordinates", coarseCoords);
515 }
516
517 // FIXME: We should remove the NodeComm on levels past the threshold
518 if (fineLevel.IsAvailable("Node Comm")) {
519 RCP<const Teuchos::Comm<int>> nodeComm = Get<RCP<const Teuchos::Comm<int>>>(fineLevel, "Node Comm");
520 Set<RCP<const Teuchos::Comm<int>>>(coarseLevel, "Node Comm", nodeComm);
521 }
522
523 Set(coarseLevel, "Nullspace", coarseNullspace);
524 Set(coarseLevel, "P", Ptentative);
525
526 if (IsPrint(Statistics2)) {
527 RCP<ParameterList> params = rcp(new ParameterList());
528 params->set("printLoadBalancingInfo", true);
529 GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
530 }
531}
532
533template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
534void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
535 BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
536 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
537 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
538 RCP<MultiVector>& coarseNullspace, const int levelID) const {
539 auto rowMap = A->getRowMap();
540 auto colMap = A->getColMap();
541
542 const size_t numRows = rowMap->getLocalNumElements();
543 const size_t NSDim = fineNullspace->getNumVectors();
544
545 typedef KokkosKernels::ArithTraits<SC> ATS;
546 using impl_SC = typename ATS::val_type;
547 using impl_ATS = KokkosKernels::ArithTraits<impl_SC>;
548 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
549
550 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
551
552 typename Aggregates::local_graph_type aggGraph;
553 {
554 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
555 aggGraph = aggregates->GetGraph();
556 }
557 auto aggRows = aggGraph.row_map;
558 auto aggCols = aggGraph.entries;
559
560 // Aggregates map is based on the amalgamated column map
561 // We can skip global-to-local conversion if LIDs in row map are
562 // same as LIDs in column map
563 bool goodMap;
564 {
565 SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
566 goodMap = isGoodMap(*rowMap, *colMap);
567 }
568 // FIXME_KOKKOS: need to proofread later code for bad maps
569 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
570 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
571 "(i.e. \"matching\" row and column maps)");
572
573 // STEP 1: do unamalgamation
574 // The non-kokkos version uses member functions from the AmalgamationInfo
575 // container class to unamalgamate the data. In contrast, the kokkos
576 // version of TentativePFactory does the unamalgamation here and only uses
577 // the data of the AmalgamationInfo container class
578
579 // Extract information for unamalgamation
580 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
581 GO indexBase;
582 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
583 GO globalOffset = amalgInfo->GlobalOffset();
584
585 // Extract aggregation info (already in Kokkos host views)
586 auto procWinner = aggregates->GetProcWinner()->getLocalViewDevice(Tpetra::Access::ReadOnly);
587 auto vertex2AggId = aggregates->GetVertex2AggId()->getLocalViewDevice(Tpetra::Access::ReadOnly);
588 const size_t numAggregates = aggregates->GetNumAggregates();
589
590 int myPID = aggregates->GetMap()->getComm()->getRank();
591
592 // Create Kokkos::View (on the device) to store the aggregate dof sizes
593 // Later used to get aggregate dof offsets
594 // NOTE: This zeros itself on construction
595 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
596 AggSizeType aggDofSizes;
597
598 if (stridedBlockSize == 1) {
599 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
600
601 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
602 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
603
604 auto sizesConst = aggregates->ComputeAggregateSizes();
605 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), sizesConst);
606
607 } else {
608 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
609
610 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
611 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
612
613 auto nodeMap = aggregates->GetMap()->getLocalMap();
614 auto dofMap = colMap->getLocalMap();
615
616 Kokkos::parallel_for(
617 "MueLu:TentativePF:Build:compute_agg_sizes", range_type(0, numAggregates),
618 KOKKOS_LAMBDA(const LO agg) {
619 auto aggRowView = aggGraph.rowConst(agg);
620
621 size_t size = 0;
622 for (LO colID = 0; colID < aggRowView.length; colID++) {
623 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
624
625 for (LO k = 0; k < stridedBlockSize; k++) {
626 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
627
628 if (dofMap.getLocalElement(dofGID) != INVALID)
629 size++;
630 }
631 }
632 aggDofSizes(agg + 1) = size;
633 });
634 }
635
636 // Find maximum dof size for aggregates
637 // Later used to reserve enough scratch space for local QR decompositions
638 LO maxAggSize = 0;
639 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
640 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
641
642 // parallel_scan (exclusive)
643 // The aggDofSizes View then contains the aggregate dof offsets
644 Kokkos::parallel_scan(
645 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
646 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
647 update += aggDofSizes(i);
648 if (final_pass)
649 aggDofSizes(i) = update;
650 });
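// Example with hypothetical sizes: three aggregates with 3, 2 and 4 DOFs give
// aggDofSizes = [0, 3, 2, 4] before the scan and [0, 3, 5, 9] after it, so
// aggregate i owns the DOF range [aggDofSizes(i), aggDofSizes(i+1)).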
651
652 // Create Kokkos::View on the device to store mapping
653 // between (local) aggregate id and row map ids (LIDs)
654 Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
655 {
656 SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
657
658 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
659 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
660
661 Kokkos::parallel_for(
662 "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
663 KOKKOS_LAMBDA(const LO lnode) {
664 if (procWinner(lnode, 0) == myPID) {
665 // No need for atomics, it's one-to-one
666 auto aggID = vertex2AggId(lnode, 0);
667
668 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
669 // FIXME: I think this may be wrong
670 // We unconditionally add the whole block here. When we calculated
671 // aggDofSizes, we did the isLocalElement check. Something's fishy.
672 for (LO k = 0; k < stridedBlockSize; k++)
673 agg2RowMapLO(offset + k) = lnode * stridedBlockSize + k;
674 }
675 });
676 }
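// Example with hypothetical data: for stridedBlockSize = 2, if aggregate 0 owns local
// nodes {4, 7}, then entries 0..3 of agg2RowMapLO hold the row LIDs {8, 9, 14, 15}
// (in some order), i.e. all DOFs the local QR below gathers for that aggregate.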
677
678 // STEP 2: prepare local QR decomposition
679 // Reserve memory for tentative prolongation operator
680 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim, true);
681
682 // Pull out the nullspace vectors so that we can have random access (on the device)
683 auto fineNS = fineNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
684 auto coarseNS = coarseNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
685
686 size_t nnz = 0; // actual number of nnz
687
688 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_device_type local_matrix_type;
689 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
690 typedef typename local_matrix_type::index_type::non_const_type cols_type;
691 typedef typename local_matrix_type::values_type::non_const_type vals_type;
692
693 // Device View for status (error messages...)
694 typedef Kokkos::View<int[10], DeviceType> status_type;
695 status_type status("status");
696
697 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
698 typename AppendTrait<status_type, Kokkos::Atomic>::type statusAtomic = status;
699
700 const ParameterList& pL = GetParameterList();
701 const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
702 if (!doQRStep) {
703 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
704 if (NSDim > 1)
705 GetOStream(Warnings0) << "TentativePFactory : for nontrivial nullspace, this may degrade performance" << std::endl;
706 }
707
708 size_t nnzEstimate = numRows * NSDim;
709 rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows + 1);
710 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
711 vals_type valsAux("Ptent_aux_vals", nnzEstimate);
712 rows_type rows("Ptent_rows", numRows + 1);
713 {
714 // Stage 0: fill in views.
715 SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
716
717 // The main thing to notice is the initialization of colsAux with INVALID. These
718 // values will later be used to compress the arrays
719 Kokkos::parallel_for(
720 "MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows + 1),
721 KOKKOS_LAMBDA(const LO row) {
722 rowsAux(row) = row * NSDim;
723 });
724 Kokkos::parallel_for(
725 "MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
726 KOKKOS_LAMBDA(const LO j) {
727 colsAux(j) = INVALID;
728 });
729 }
730
731 if (NSDim == 1) {
732 // 1D is special, as it is the easiest. We don't even need to do the QR,
733 // just normalize an array. Plus, no worries about small aggregates. In
734 // addition, we do not worry about compression. It is unlikely that
735 // nullspace will have zeros. If it does, a prolongator row would be
736 // zero and we'll get singularity anyway.
737 SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
738
739 // Set up team policy with numAggregates teams and one thread per team.
740 // Each team handles a slice of the data associated with one aggregate
741 // and performs a local QR decomposition (in this case real QR is
742 // unnecessary).
743 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
744
745 if (doQRStep) {
746 Kokkos::parallel_for(
747 "MueLu:TentativePF:BuildUncoupled:main_loop", policy,
748 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
749 auto agg = thread.league_rank();
750
751 // size of the aggregate (number of DOFs in aggregate)
752 LO aggSize = aggRows(agg + 1) - aggRows(agg);
753
754 // Extract the piece of the nullspace corresponding to the aggregate, and
755 // put it in the flat array, "localQR" (in column major format) for the
756 // QR routine. Trivial in 1D.
757 auto norm = impl_ATS::magnitude(zero);
758
759 // Calculate QR by hand
760 // FIXME: shouldn't there be stridedblock here?
761 // FIXME_KOKKOS: shouldn't there be stridedblock here?
762 for (decltype(aggSize) k = 0; k < aggSize; k++) {
763 auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0));
764 norm += dnorm * dnorm;
765 }
766 norm = sqrt(norm);
767
768 if (norm == zero) {
769 // zero column; terminate the execution
770 statusAtomic(1) = true;
771 return;
772 }
773
774 // R = norm
775 coarseNS(agg, 0) = norm;
776
777 // Q = localQR(:,0)/norm
778 for (decltype(aggSize) k = 0; k < aggSize; k++) {
779 LO localRow = agg2RowMapLO(aggRows(agg) + k);
780 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0) / norm;
781
782 rows(localRow + 1) = 1;
783 colsAux(localRow) = agg;
784 valsAux(localRow) = localVal;
785 }
786 });
787
788 typename status_type::host_mirror_type statusHost = Kokkos::create_mirror_view(status);
789 Kokkos::deep_copy(statusHost, status);
790 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
791 if (statusHost(i)) {
792 std::ostringstream oss;
793 oss << "MueLu::TentativePFactory::MakeTentative: ";
794 switch (i) {
795 case 0: oss << "!goodMap is not implemented"; break;
796 case 1: oss << "fine level NS part has a zero column"; break;
797 }
798 throw Exceptions::RuntimeError(oss.str());
799 }
800
801 } else {
802 Kokkos::parallel_for(
803 "MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
804 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
805 auto agg = thread.league_rank();
806
807 // size of the aggregate (number of DOFs in aggregate)
808 LO aggSize = aggRows(agg + 1) - aggRows(agg);
809
810 // R = 1 (no QR, so no norm is computed)
811 coarseNS(agg, 0) = one;
812
813 // Q = fine level nullspace (copied as is, no normalization)
814 for (decltype(aggSize) k = 0; k < aggSize; k++) {
815 LO localRow = agg2RowMapLO(aggRows(agg) + k);
816 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0);
817
818 rows(localRow + 1) = 1;
819 colsAux(localRow) = agg;
820 valsAux(localRow) = localVal;
821 }
822 });
823 }
824
825 Kokkos::parallel_reduce(
826 "MueLu:TentativeP:CountNNZ", range_type(0, numRows + 1),
827 KOKKOS_LAMBDA(const LO i, size_t& nnz_count) {
828 nnz_count += rows(i);
829 },
830 nnz);
831
832 } else { // NSDim > 1
833 // FIXME_KOKKOS: This code branch is completely unoptimized.
834 // Work to do:
835 // - Optimize QR decomposition
836 // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
837 // packing new values in the beginning of each row
838 // We do use an auxiliary view in this case, so keep a second rows view for
839 // counting nonzeros in rows
840
841 {
842 SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
843 // Set up team policy with numAggregates teams and one thread per team.
844 // Each team handles a slice of the data associated with one aggregate
845 // and performs a local QR decomposition
846 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1); // numAggregates teams, 1 thread each
847 LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
848 decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
849 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
850 decltype(valsAux)>
851 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
852 rows, rowsAux, colsAux, valsAux, doQRStep);
853 Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
854 }
855
856 typename status_type::host_mirror_type statusHost = Kokkos::create_mirror_view(status);
857 Kokkos::deep_copy(statusHost, status);
858 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
859 if (statusHost(i)) {
860 std::ostringstream oss;
861 oss << "MueLu::TentativePFactory::MakeTentative: ";
862 switch (i) {
863 case 0: oss << "!goodMap is not implemented"; break;
864 case 1: oss << "fine level NS part has a zero column"; break;
865 }
866 throw Exceptions::RuntimeError(oss.str());
867 }
868 }
869
870 // Compress the cols and vals by ignoring INVALID column entries that correspond
871 // to 0 in QR.
872
873 // The real cols and vals are constructed using calculated (not estimated) nnz
874 cols_type cols;
875 vals_type vals;
876
877 if (nnz != nnzEstimate) {
878 {
879 // Stage 2: compress the arrays
880 SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
881
882 Kokkos::parallel_scan(
883 "MueLu:TentativePF:Build:compress_rows", range_type(0, numRows + 1),
884 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
885 upd += rows(i);
886 if (final)
887 rows(i) = upd;
888 });
889 }
890
891 {
892 SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
893
894 cols = cols_type("Ptent_cols", nnz);
895 vals = vals_type("Ptent_vals", nnz);
896
897 // FIXME_KOKKOS: this can be sped up by moving correct cols and vals values
898 // to the beginning of rows. See CoalesceDropFactory_kokkos for
899 // example.
900 Kokkos::parallel_for(
901 "MueLu:TentativePF:Build:compress_cols_vals", range_type(0, numRows),
902 KOKKOS_LAMBDA(const LO i) {
903 LO rowStart = rows(i);
904
905 size_t lnnz = 0;
906 for (auto j = rowsAux(i); j < rowsAux(i + 1); j++)
907 if (colsAux(j) != INVALID) {
908 cols(rowStart + lnnz) = colsAux(j);
909 vals(rowStart + lnnz) = valsAux(j);
910 lnnz++;
911 }
912 });
913 }
914
915 } else {
916 rows = rowsAux;
917 cols = colsAux;
918 vals = valsAux;
919 }
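// Example with hypothetical data (NSDim = 2, two rows keeping one entry each):
// rowsAux = [0, 2, 4], colsAux = [5, INVALID, 7, INVALID] compress to
// rows = [0, 1, 2], cols = [5, 7]; the INVALID placeholders left where Q had
// exact zeros are squeezed out and the row pointers are rebuilt by the scan above.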
920
921 GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
922
923 {
924 // Stage 3: construct Xpetra::Matrix
925 SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
926
927 local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
928
929 // Managing labels & constants for ESFC
930 RCP<ParameterList> FCparams;
931 if (pL.isSublist("matrixmatrix: kernel params"))
932 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
933 else
934 FCparams = rcp(new ParameterList);
935
936 // By default, we don't need global constants for TentativeP
937 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
938 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
939
940 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
941 Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
942 }
943}
944
945template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
946void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
947 BuildPuncoupledBlockCrs(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
948 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
949 RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
950 RCP<MultiVector>& coarseNullspace, const int levelID) const {
951 /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
952 be generalized later, if we ever need to do so:
953 1) Null space dimension == block size of matrix: So no elasticity right now
954 2) QR is not supported: Under assumption #1, this shouldn't cause problems.
955 3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
956
957 These assumptions keep our code way simpler and still support the use cases we actually care about.
958 */
959
960 RCP<const Map> rowMap = A->getRowMap();
961 RCP<const Map> rangeMap = A->getRangeMap();
962 RCP<const Map> colMap = A->getColMap();
963 // const size_t numFinePointRows = rangeMap->getLocalNumElements();
964 const size_t numFineBlockRows = rowMap->getLocalNumElements();
965
966 // typedef Teuchos::ScalarTraits<SC> STS;
967 // typedef typename STS::magnitudeType Magnitude;
968 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
969
970 typedef KokkosKernels::ArithTraits<SC> ATS;
971 using impl_SC = typename ATS::val_type;
972 using impl_ATS = KokkosKernels::ArithTraits<impl_SC>;
973 const impl_SC one = impl_ATS::one();
974
975 // const GO numAggs = aggregates->GetNumAggregates();
976 const size_t NSDim = fineNullspace->getNumVectors();
977 auto aggSizes = aggregates->ComputeAggregateSizes();
978
979 typename Aggregates::local_graph_type aggGraph;
980 {
981 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
982 aggGraph = aggregates->GetGraph();
983 }
984 auto aggRows = aggGraph.row_map;
985 auto aggCols = aggGraph.entries;
986
987 // Need to generate the coarse block map
988 // NOTE: We assume NSDim == block size here
989 // NOTE: We also assume that coarseMap has contiguous GIDs
990 // const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
991 const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
992 RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
993 Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
994 numCoarseBlockRows,
995 coarsePointMap->getIndexBase(),
996 coarsePointMap->getComm());
997 // Sanity checking
998 const ParameterList& pL = GetParameterList();
999 // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1000
1001 // The aggregates use the amalgamated column map, which in this case is what we want
1002
1003 // Aggregates map is based on the amalgamated column map
1004 // We can skip global-to-local conversion if LIDs in row map are
1005 // same as LIDs in column map
1006 bool goodMap = MueLu::Utilities<SC, LO, GO, NO>::MapsAreNested(*rowMap, *colMap);
1007 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
1008 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1009 "(i.e. \"matching\" row and column maps)");
1010
1011 // STEP 1: do unamalgamation
1012 // The non-kokkos version uses member functions from the AmalgamationInfo
1013 // container class to unamalgamate the data. In contrast, the kokkos
1014 // version of TentativePFactory does the unamalgamation here and only uses
1015 // the data of the AmalgamationInfo container class
1016
1017 // Extract information for unamalgamation
1018 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1019 GO indexBase;
1020 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1021 // GO globalOffset = amalgInfo->GlobalOffset();
1022
1023 // Extract aggregation info (already in Kokkos host views)
1024 auto procWinner = aggregates->GetProcWinner()->getLocalViewDevice(Tpetra::Access::ReadOnly);
1025 auto vertex2AggId = aggregates->GetVertex2AggId()->getLocalViewDevice(Tpetra::Access::ReadOnly);
1026 const size_t numAggregates = aggregates->GetNumAggregates();
1027
1028 int myPID = aggregates->GetMap()->getComm()->getRank();
1029
1030 // Create Kokkos::View (on the device) to store the aggregate dof sizes
1031 // Later used to get aggregate dof offsets
1032 // NOTE: This zeros itself on construction
1033 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1034 AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1035
1036 {
1037 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1038
1039 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1040 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
1041
1042 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), aggSizes);
1043 }
1044
1045 // Find maximum dof size for aggregates
1046 // Later used to reserve enough scratch space for local QR decompositions
1047 LO maxAggSize = 0;
1048 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
1049 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1050
1051 // parallel_scan (exclusive)
1052 // The aggDofSizes View then contains the aggregate dof offsets
1053 Kokkos::parallel_scan(
1054 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
1055 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1056 update += aggDofSizes(i);
1057 if (final_pass)
1058 aggDofSizes(i) = update;
1059 });
1060
1061 // Create Kokkos::View on the device to store mapping
1062 // between (local) aggregate id and row map ids (LIDs)
1063 Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1064 {
1065 SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1066
1067 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1068 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1069
1070 Kokkos::parallel_for(
1071 "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1072 KOKKOS_LAMBDA(const LO lnode) {
1073 if (procWinner(lnode, 0) == myPID) {
1074 // No need for atomics, it's one-to-one
1075 auto aggID = vertex2AggId(lnode, 0);
1076
1077 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
1078 // FIXME: I think this may be wrong
1079 // We unconditionally add the whole block here. When we calculated
1080 // aggDofSizes, we did the isLocalElement check. Something's fishy.
1081 for (LO k = 0; k < stridedBlockSize; k++)
1082 aggToRowMapLO(offset + k) = lnode * stridedBlockSize + k;
1083 }
1084 });
1085 }
1086
1087 // STEP 2: prepare local QR decomposition
1088 // Reserve memory for tentative prolongation operator
1089 coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim, true);
1090
1091 // Pull out the nullspace vectors so that we can have random access (on the device)
1092 auto fineNS = fineNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
1093 auto coarseNS = coarseNullspace->getLocalViewDevice(Tpetra::Access::ReadWrite);
1094
1095 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_device_type local_matrix_type;
1096 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1097 typedef typename local_matrix_type::index_type::non_const_type cols_type;
1098 // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1099
1100 // Device View for status (error messages...)
1101 typedef Kokkos::View<int[10], DeviceType> status_type;
1102 status_type status("status");
1103
1104 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
1106
1107 // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1108 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1109
1110 // BlockCrs requires that we build the (block) graph first, so let's do that...
1111
1112 // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1113 // block non-zero per row in the matrix;
1114 rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1115 cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1116
1117 Kokkos::parallel_for(
1118 "MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1119 KOKKOS_LAMBDA(const LO j) {
1120 ia[j] = j;
1121 ja[j] = INVALID;
1122
1123 if (j == (LO)numFineBlockRows - 1)
1124 ia[numFineBlockRows] = numFineBlockRows;
1125 });
1126
1127 // Fill Graph
1128 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1129 Kokkos::parallel_for(
1130 "MueLu:TentativePF:BlockCrs:fillGraph", policy,
1131 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1132 auto agg = thread.league_rank();
1133 Xpetra::global_size_t offset = agg;
1134
1135 // size of the aggregate (number of DOFs in aggregate)
1136 LO aggSize = aggRows(agg + 1) - aggRows(agg);
1137
1138 for (LO j = 0; j < aggSize; j++) {
1139 // FIXME: Allow for bad maps
1140 const LO localRow = aggToRowMapLO[aggDofSizes[agg] + j];
1141 const size_t rowStart = ia[localRow];
1142 ja[rowStart] = offset;
1143 }
1144 });
1145
1146 // Compress storage (remove all INVALID, which happen when we skip zeros)
1147 // We do that in-place
1148 {
1149 // Stage 2: compress the arrays
1150 SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1151 // Fill i_temp with the correct row starts
1152 rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1153 LO nnz = 0;
1154 Kokkos::parallel_scan(
1155 "MueLu:TentativePF:BlockCrs:compress_rows", range_type(0, numFineBlockRows),
1156 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1157 if (final)
1158 i_temp[i] = upd;
1159 for (auto j = ia[i]; j < ia[i + 1]; j++)
1160 if (ja[j] != INVALID)
1161 upd++;
1162 if (final && i == (LO)numFineBlockRows - 1)
1163 i_temp[numFineBlockRows] = upd;
1164 },
1165 nnz);
1166
1167 cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1168
1169 Kokkos::parallel_for(
1170 "MueLu:TentativePF:BlockCrs:compress_cols", range_type(0, numFineBlockRows),
1171 KOKKOS_LAMBDA(const LO i) {
1172 size_t rowStart = i_temp[i];
1173 size_t lnnz = 0;
1174 for (auto j = ia[i]; j < ia[i + 1]; j++)
1175 if (ja[j] != INVALID) {
1176 j_temp[rowStart + lnnz] = ja[j];
1177 lnnz++;
1178 }
1179 });
1180
1181 ia = i_temp;
1182 ja = j_temp;
1183 }
1184
1185 RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, ia, ja);
1186
1187 // Managing labels & constants for ESFC
1188 {
1189 RCP<ParameterList> FCparams;
1190 if (pL.isSublist("matrixmatrix: kernel params"))
1191 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1192 else
1193 FCparams = rcp(new ParameterList);
1194 // By default, we don't need global constants for TentativeP
1195 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
1196 std::string levelIDs = toString(levelID);
1197 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs);
1198 RCP<const Export> dummy_e;
1199 RCP<const Import> dummy_i;
1200 BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams);
1201 }
1202
1203 // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1204 // we clear them here
1205 ia = rows_type();
1206 ja = cols_type();
1207
1208 // Now let's make a BlockCrs Matrix
1209 // NOTE: Assumes block size== NSDim
1210 RCP<Xpetra::CrsMatrix<SC, LO, GO, NO>> P_xpetra = Xpetra::CrsMatrixFactory<SC, LO, GO, NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim);
1211 RCP<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>> P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>>(P_xpetra);
1212 if (P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1213 RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1214
1215 auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1216 const LO stride = NSDim * NSDim;
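// Each block row of P owns exactly one dense NSDim x NSDim block (the block column of
// its aggregate), stored row-major in 'values' at offset localBlockRow * stride, so
// entry (r, c) of that block lives at values[localBlockRow * stride + r * NSDim + c].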
1217
1218 Kokkos::parallel_for(
1219 "MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1220 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1221 auto agg = thread.league_rank();
1222
1223 // size of the aggregate (number of DOFs in aggregate)
1224 LO aggSize = aggRows(agg + 1) - aggRows(agg);
1225 Xpetra::global_size_t offset = agg * NSDim;
1226
1227 // Copy the fine level nullspace block into the values of P (no QR, no normalization)
1228 for (LO j = 0; j < aggSize; j++) {
1229 LO localBlockRow = aggToRowMapLO(aggRows(agg) + j);
1230 LO rowStart = localBlockRow * stride;
1231 for (LO r = 0; r < (LO)NSDim; r++) {
1232 LO localPointRow = localBlockRow * NSDim + r;
1233 for (LO c = 0; c < (LO)NSDim; c++) {
1234 values[rowStart + r * NSDim + c] = fineNSRandom(localPointRow, c);
1235 }
1236 }
1237 }
1238
1239 // R = I (coarse nullspace block is the identity)
1240 for (LO j = 0; j < (LO)NSDim; j++)
1241 coarseNS(offset + j, j) = one;
1242 });
1243
1244 Ptentative = P_wrap;
1245}
1246
1247template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1248void TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1249 BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates> /* aggregates */,
1250 RCP<AmalgamationInfo> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1251 RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1252 RCP<MultiVector>& /* coarseNullspace */) const {
1253 throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1254}
1255
1256template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1257bool TentativePFactory_kokkos<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1258 isGoodMap(const Map& rowMap, const Map& colMap) const {
1259 auto rowLocalMap = rowMap.getLocalMap();
1260 auto colLocalMap = colMap.getLocalMap();
1261
1262 const size_t numRows = rowLocalMap.getLocalNumElements();
1263 const size_t numCols = colLocalMap.getLocalNumElements();
1264
1265 if (numCols < numRows)
1266 return false;
1267
1268 size_t numDiff = 0;
1269 Kokkos::parallel_reduce(
1270 "MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1271 KOKKOS_LAMBDA(const LO i, size_t& diff) {
1272 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1273 },
1274 numDiff);
1275
1276 return (numDiff == 0);
1277}
1278
1279} // namespace MueLu
1280
1281#define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1282#endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP