10#ifndef __TPETRA_DISTRIBUTORLOWERTRIANGULARBLOCK_HPP
11#define __TPETRA_DISTRIBUTORLOWERTRIANGULARBLOCK_HPP
14#include "Tpetra_Distributor.hpp"
18#include "Tpetra_Map.hpp"
19#include "Tpetra_Operator.hpp"
20#include "Tpetra_Vector.hpp"
21#include "Tpetra_CrsMatrix.hpp"
26template <
typename gno_t,
typename scalar_t,
typename node_t>
27class DistributionLowerTriangularBlock :
public Distribution<gno_t, scalar_t> {
109 using Distribution<gno_t, scalar_t>::me;
110 using Distribution<gno_t, scalar_t>::np;
111 using Distribution<gno_t, scalar_t>::comm;
112 using Distribution<gno_t, scalar_t>::nrows;
113 using typename Distribution<gno_t, scalar_t>::NZindex_t;
114 using typename Distribution<gno_t, scalar_t>::LocalNZmap_t;
120 DistributionLowerTriangularBlock(
size_t nrows_,
121 const Teuchos::RCP<
const Teuchos::Comm<int> > &comm_,
122 const Teuchos::ParameterList ¶ms)
123 : Distribution<gno_t, scalar_t>(nrows_, comm_, params)
124 , initialDist(nrows_, comm_, params)
125 , sortByDegree(false)
126 , permMatrix(Teuchos::null)
127 , redistributed(false)
128 , chunksComputed(false)
131 nChunks = int(std::sqrt(
float(npnp)));
132 while (nChunks * (nChunks + 1) < npnp) nChunks++;
134 TEUCHOS_TEST_FOR_EXCEPTION(nChunks * (nChunks + 1) != npnp, std::logic_error,
135 "Number of processors np = " << np <<
" must satisfy np = q(q+1)/2 for some q"
136 <<
" for LowerTriangularBlock distribution");
137 nChunksPerRow = double(nChunks) / double(nrows);
139 const Teuchos::ParameterEntry *pe = params.getEntryPtr(
"sortByDegree");
140 if (pe != NULL) sortByDegree = pe->getValue<
bool>(&sortByDegree);
142 pe = params.getEntryPtr(
"readPerProcess");
143 if (pe != NULL) redistributed = pe->getValue<
bool>(&redistributed);
145 if (me == 0) std::cout <<
"\n LowerTriangularBlock Distribution: "
147 <<
"\n nChunks = " << nChunks
151 enum DistributionType DistType() {
return LowerTriangularBlock; }
153 bool areChunksComputed() {
return chunksComputed; }
155 Teuchos::Array<gno_t> getChunkCuts() {
159 throw std::runtime_error(
"Error: Requested chunk cuts have not been computed yet.");
166 inline bool VecMine(gno_t i) {
return initialDist.VecMine(i); }
171 bool Mine(gno_t i, gno_t j) {
176 return (procFromChunks(i, j) == me);
178 return initialDist.Mine(i, j);
181 inline bool Mine(gno_t i, gno_t j,
int p) {
return Mine(i, j); }
184 void Redistribute(LocalNZmap_t &localNZ) {
189 gno_t myFirstRow = initialDist.getFirstRow(me);
190 gno_t nMyRows = initialDist.getNumRow(me);
191 Teuchos::Array<gno_t> nnzPerRow(nMyRows, 0);
193 Teuchos::Array<int> rcvcnt(np);
194 Teuchos::Array<int> disp(np);
195 for (
int sum = 0, p = 0; p < np; p++) {
196 int prows = initialDist.getNumRow(p);
207 Teuchos::Array<gno_t> permuteIndex;
208 Teuchos::Array<gno_t> sortedOrder;
210 Teuchos::Array<gno_t> globalRowBuf;
214 globalRowBuf.resize(nrows, 0);
219 for (
auto it = localNZ.begin(); it != localNZ.end(); it++) {
220 gno_t I = it->first.first;
221 nnzPerRow[I - myFirstRow]++;
224 Teuchos::gatherv<int, gno_t>(nnzPerRow.getRawPtr(), nMyRows,
225 globalRowBuf.getRawPtr(),
226 rcvcnt.getRawPtr(), disp.getRawPtr(),
229 permuteIndex.resize(nrows);
230 sortedOrder.resize(nrows);
234 for (
size_t i = 0; i != nrows; i++) sortedOrder[i] = i;
236 std::sort(sortedOrder.begin(), sortedOrder.end(),
237 [&](
const size_t &a,
const size_t &b) {
238 return (globalRowBuf[a] > globalRowBuf[b]);
242 for (
size_t i = 0; i < nrows; i++) {
243 permuteIndex[sortedOrder[i]] = i;
247 Teuchos::broadcast<int, gno_t>(*comm, 0, permuteIndex(0, nrows));
256 Teuchos::Array<gno_t> myRows;
257 for (
size_t i = 0; i < nrows; i++) {
258 if (VecMine(i)) myRows.push_back(i);
262 Teuchos::OrdinalTraits<Tpetra::global_size_t>::invalid();
263 Teuchos::RCP<const map_t> permMap =
264 rcp(
new map_t(dummy, myRows(), 0, comm));
266 permMatrix = rcp(
new matrix_t(permMap, 1));
268 Teuchos::Array<gno_t> cols(1);
269 Teuchos::Array<scalar_t> vals(1);
272 for (
size_t i = 0; i < permMap->getLocalNumElements(); i++) {
273 gno_t gid = permMap->getGlobalElement(i);
274 cols[0] = permuteIndex[gid];
275 permMatrix->insertGlobalValues(gid, cols(), vals());
278 permMatrix->fillComplete(permMap, permMap);
284 nnzPerRow.assign(nMyRows, 0);
286 for (
auto it = localNZ.begin(); it != localNZ.end(); it++) {
287 gno_t I = (sortByDegree ? permuteIndex[it->first.first]
289 gno_t J = (sortByDegree ? permuteIndex[it->first.second]
292 nnzPerRow[it->first.first - myFirstRow]++;
300 Teuchos::gatherv<int, gno_t>(nnzPerRow.getRawPtr(), nMyRows,
301 globalRowBuf.getRawPtr(),
302 rcvcnt.getRawPtr(), disp.getRawPtr(),
305 Teuchos::Array<int>().swap(rcvcnt);
306 Teuchos::Array<int>().swap(disp);
309 Teuchos::reduceAll(*comm, Teuchos::REDUCE_SUM, 1, &nnz, &gNnz);
311 chunkCuts.resize(nChunks + 1, 0);
317 size_t target = gNnz / np;
318 size_t targetRunningTotal = 0;
319 size_t currentRunningTotal = 0;
321 for (
int chunkCnt = 0; chunkCnt < nChunks; chunkCnt++) {
322 targetRunningTotal = (target * (chunkCnt + 1));
323 currentRunningTotal = 0;
324 while (I <
static_cast<gno_t
>(nrows)) {
325 size_t nextNnz = (sortByDegree ? globalRowBuf[sortedOrder[I]]
327 if (currentRunningTotal + nextNnz <= targetRunningTotal) {
328 currentRunningTotal += nextNnz;
333 chunkCuts[chunkCnt + 1] = I;
335 chunkCuts[nChunks] =
static_cast<gno_t
>(nrows);
339 Teuchos::Array<gno_t>().swap(globalRowBuf);
341 Teuchos::broadcast<int, gno_t>(*comm, 0, chunkCuts(0, nChunks + 1));
342 chunksComputed =
true;
345 Kokkos::View<gno_t *, Kokkos::HostSpace> iOut(
"iOut", localNZ.size());
346 Kokkos::View<gno_t *, Kokkos::HostSpace> jOut(
"jOut", localNZ.size());
347 Kokkos::View<scalar_t *, Kokkos::HostSpace> vOut(
"vOut", localNZ.size());
348 Teuchos::Array<int> pOut(localNZ.size());
351 for (
auto it = localNZ.begin(); it != localNZ.end(); it++) {
352 iOut[sendCnt] = (sortByDegree ? permuteIndex[it->first.first]
354 jOut[sendCnt] = (sortByDegree ? permuteIndex[it->first.second]
356 if (jOut[sendCnt] <= iOut[sendCnt]) {
357 vOut[sendCnt] = it->second;
358 pOut[sendCnt] = procFromChunks(iOut[sendCnt], jOut[sendCnt]);
365 LocalNZmap_t().swap(localNZ);
366 if (sortByDegree) Teuchos::Array<gno_t>().swap(permuteIndex);
370 size_t nrecvs = plan.createFromSends(pOut(0, sendCnt));
371 Kokkos::View<gno_t *, Kokkos::HostSpace> iIn(
"iIn", nrecvs);
372 Kokkos::View<gno_t *, Kokkos::HostSpace> jIn(
"jIn", nrecvs);
373 Kokkos::View<scalar_t *, Kokkos::HostSpace> vIn(
"vIn", nrecvs);
376 auto sendIndices = std::make_pair(
static_cast<size_t>(0), sendCnt);
377 plan.doPostsAndWaits(Kokkos::subview(iOut, sendIndices), 1, iIn);
378 plan.doPostsAndWaits(Kokkos::subview(jOut, sendIndices), 1, jIn);
379 plan.doPostsAndWaits(Kokkos::subview(vOut, sendIndices), 1, vIn);
382 for (
size_t n = 0; n < nrecvs; n++) {
383 NZindex_t nz(iIn[n], jIn[n]);
384 localNZ[nz] = vIn[n];
387 redistributed =
true;
390 Teuchos::RCP<matrix_t> getPermutationMatrix()
const {
return permMatrix; }
394 Distribution1DLinear<gno_t, scalar_t, node_t> initialDist;
401 Teuchos::RCP<matrix_t> permMatrix;
415 double nChunksPerRow;
416 Teuchos::Array<gno_t> chunkCuts;
418 int findIdxInChunks(gno_t I) {
419 int m = I * nChunksPerRow;
420 while (I < chunkCuts[m]) m--;
421 while (I >= chunkCuts[m + 1]) m++;
425 int procFromChunks(gno_t I, gno_t J) {
426 int m = findIdxInChunks(I);
427 int n = findIdxInChunks(J);
428 int p = m * (m + 1) / 2 + n;
436template <
typename scalar_t,
437 class Node = ::Tpetra::Details::DefaultTypes::node_type>
438class LowerTriangularBlockOperator :
public Tpetra::Operator<scalar_t, Tpetra::Map<>::local_ordinal_type,
439 Tpetra::Map<>::global_ordinal_type,
452 LowerTriangularBlockOperator(
453 const Teuchos::RCP<const matrix_t> &lowerTriangularMatrix_,
455 : lowerTriangularMatrix(lowerTriangularMatrix_)
456 , permMatrix(dist.getPermutationMatrix()) {
459 TEUCHOS_TEST_FOR_EXCEPTION(
460 !lowerTriangularMatrix->getRangeMap()->isSameAs(
461 *lowerTriangularMatrix->getDomainMap()),
463 "The Domain and Range maps of the LowerTriangularBlock matrix "
468 vector_t diagByRowMap(lowerTriangularMatrix->getRowMap());
469 lowerTriangularMatrix->getLocalDiagCopy(diagByRowMap);
470 diag = Teuchos::rcp(
new vector_t(lowerTriangularMatrix->getRangeMap()));
472 lowerTriangularMatrix->getRangeMap());
473 diag->doExport(diagByRowMap, exporter,
Tpetra::ADD);
476 void apply(
const mvector_t &x, mvector_t &y, Teuchos::ETransp mode,
477 scalar_t alpha, scalar_t beta)
const {
478 scalar_t
ZERO = Teuchos::ScalarTraits<scalar_t>::zero();
479 scalar_t ONE = Teuchos::ScalarTraits<scalar_t>::one();
488 if (permMatrix == Teuchos::null) {
490 lowerTriangularMatrix->apply(x, y, Teuchos::NO_TRANS, alpha, beta);
493 lowerTriangularMatrix->apply(x, y, Teuchos::TRANS, alpha, ONE);
496 y.elementWiseMultiply(-alpha, *diag, x, ONE);
503 vector_t xtmp(x.getMap(), x.getNumVectors());
504 vector_t ytmp(y.getMap(), y.getNumVectors());
506 permMatrix->apply(x, xtmp, Teuchos::TRANS);
507 if (beta !=
ZERO) permMatrix->apply(y, ytmp, Teuchos::TRANS);
510 lowerTriangularMatrix->apply(xtmp, ytmp, Teuchos::NO_TRANS, alpha, beta);
513 lowerTriangularMatrix->apply(xtmp, ytmp, Teuchos::TRANS, alpha, ONE);
516 ytmp.elementWiseMultiply(-alpha, *diag, xtmp, ONE);
518 permMatrix->apply(ytmp, y, Teuchos::NO_TRANS);
522 Teuchos::RCP<const map_t> getDomainMap()
const {
523 return lowerTriangularMatrix->getDomainMap();
526 Teuchos::RCP<const map_t> getRangeMap()
const {
527 return lowerTriangularMatrix->getRangeMap();
530 bool hasTransposeApply()
const {
return true; }
533 const Teuchos::RCP<const matrix_t> lowerTriangularMatrix;
534 const Teuchos::RCP<const matrix_t> permMatrix;
535 Teuchos::RCP<vector_t> diag;
Functions for initializing and finalizing Tpetra.
Struct that holds views of the contents of a CrsMatrix.
Sets up and executes a communication plan for a Tpetra DistObject.
Abstract interface for operators (e.g., matrices and preconditioners).
virtual void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const =0
Computes the operator-multivector application.
int local_ordinal_type
Default value of Scalar template parameter.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
@ ZERO
Replace old values with zero.