MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_RepartitionHeuristicFactory_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// MueLu: A package for multigrid based preconditioning
4//
5// Copyright 2012 NTESS and the MueLu contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_
11#define PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_
12
13#include <algorithm>
14#include <iostream>
15#include <sstream>
16
17#ifdef HAVE_MPI
18#include <Teuchos_DefaultMpiComm.hpp>
19#include <Teuchos_CommHelpers.hpp>
20
21//#include <Xpetra_Map.hpp>
22#include <Xpetra_Matrix.hpp>
23
24#include "MueLu_RAPFactory.hpp"
25#include "MueLu_BlockedRAPFactory.hpp"
26#include "MueLu_SubBlockAFactory.hpp"
27#include "MueLu_Level.hpp"
28#include "MueLu_MasterList.hpp"
29#include "MueLu_Monitor.hpp"
30
32
33namespace MueLu {
34
35template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
37
38template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
40
41template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
43 RCP<ParameterList> validParamList = rcp(new ParameterList());
44
45#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
46 SET_VALID_ENTRY("repartition: start level");
47 SET_VALID_ENTRY("repartition: use map");
48 SET_VALID_ENTRY("repartition: node repartition level");
49 SET_VALID_ENTRY("repartition: min rows per proc");
50 SET_VALID_ENTRY("repartition: target rows per proc");
51 SET_VALID_ENTRY("repartition: min rows per thread");
52 SET_VALID_ENTRY("repartition: target rows per thread");
53 SET_VALID_ENTRY("repartition: max imbalance");
54 SET_VALID_ENTRY("repartition: put on single proc");
55#undef SET_VALID_ENTRY
56
57 validParamList->set<RCP<const FactoryBase> >("A", Teuchos::null, "Factory of the matrix A");
58 validParamList->set<RCP<const FactoryBase> >("Map", Teuchos::null, "Factory of the map Map");
59 validParamList->set<RCP<const FactoryBase> >("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
60
61 return validParamList;
62}
63
64template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
66 const Teuchos::ParameterList& pL = GetParameterList();
67 if (pL.isParameter("repartition: use map")) {
68 const bool useMap = pL.get<bool>("repartition: use map");
69 if (useMap)
70 Input(currentLevel, "Map");
71 else
72 Input(currentLevel, "A");
73 } else
74 Input(currentLevel, "A");
75 if (pL.isParameter("repartition: node repartition level")) {
76 const int nodeRepartLevel = pL.get<int>("repartition: node repartition level");
77 if (currentLevel.GetLevelID() == nodeRepartLevel) {
78 Input(currentLevel, "Node Comm");
79 }
80 }
81}
82
83template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
85 FactoryMonitor m(*this, "Build", currentLevel);
86
87 const Teuchos::ParameterList& pL = GetParameterList();
88 // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
89 // TODO (JG): I don't really know if we want to do this.
90 const int startLevel = pL.get<int>("repartition: start level");
91 const int nodeRepartLevel = pL.get<int>("repartition: node repartition level");
92 LO minRowsPerProcess = pL.get<LO>("repartition: min rows per proc");
93 LO targetRowsPerProcess = pL.get<LO>("repartition: target rows per proc");
94 LO minRowsPerThread = pL.get<LO>("repartition: min rows per thread");
95 LO targetRowsPerThread = pL.get<LO>("repartition: target rows per thread");
96 LO putOnSingleProc = pL.get<LO>("repartition: put on single proc");
97 const double nonzeroImbalance = pL.get<double>("repartition: max imbalance");
98 const bool useMap = pL.get<bool>("repartition: use map");
99
100 int thread_per_mpi_rank = 1;
101#if defined(KOKKOS_ENABLE_OPENMP)
102 using execution_space = typename Node::device_type::execution_space;
103 if (std::is_same<execution_space, Kokkos::OpenMP>::value)
104 thread_per_mpi_rank = execution_space().concurrency();
105#endif
106
107 if (minRowsPerThread > 0)
108 // We ignore the value given by minRowsPerProcess and repartition based on threads instead
109 minRowsPerProcess = minRowsPerThread * thread_per_mpi_rank;
110
111 if (targetRowsPerThread == 0)
112 targetRowsPerThread = minRowsPerThread;
113
114 if (targetRowsPerThread > 0)
115 // We ignore the value given by targetRowsPerProcess and repartition based on threads instead
116 targetRowsPerProcess = targetRowsPerThread * thread_per_mpi_rank;
117
118 if (targetRowsPerProcess == 0)
119 targetRowsPerProcess = minRowsPerProcess;
120
121 // Stick this on the level so Zoltan2Interface can use this later
122 Set<LO>(currentLevel, "repartition: heuristic target rows per process", targetRowsPerProcess);
123
124 // Check for validity of the node repartition option
125 TEUCHOS_TEST_FOR_EXCEPTION(nodeRepartLevel >= startLevel, Exceptions::RuntimeError, "MueLu::RepartitionHeuristicFactory::Build(): If 'repartition: node repartition level' is set, it must be less than or equal to 'repartition: start level'");
126
127 RCP<Matrix> A;
128 RCP<const FactoryBase> Afact;
129 RCP<const Map> map;
130 if (!useMap) {
131 Afact = GetFactory("A");
132 if (!Afact.is_null() && Teuchos::rcp_dynamic_cast<const RAPFactory>(Afact) == Teuchos::null &&
133 Teuchos::rcp_dynamic_cast<const BlockedRAPFactory>(Afact) == Teuchos::null &&
134 Teuchos::rcp_dynamic_cast<const SubBlockAFactory>(Afact) == Teuchos::null) {
135 GetOStream(Warnings) << "MueLu::RepartitionHeuristicFactory::Build: The generation factory for A must "
136 "be a RAPFactory or a SubBlockAFactory providing the non-rebalanced matrix information! "
137 "It specifically must not be of type Rebalance(Blocked)AcFactory or similar. "
138 "Please check the input. Make also sure that \"number of partitions\" is provided to "
139 "the Interface class and the RepartitionFactory instance. Instead, we have a "
140 << Afact->description() << std::endl;
141 }
142 // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
143 A = Get<RCP<Matrix> >(currentLevel, "A");
144 map = A->getRowMap();
145 } else
146 map = Get<RCP<const Map> >(currentLevel, "Map");
147
148 // ======================================================================================================
149 // Determine whether partitioning is needed
150 // ======================================================================================================
151 // NOTE: most tests include some global communication, which is why we currently only do tests until we make
152 // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to
153 // rebalance an operator. So, it would probably be beneficial to do and report *all* tests.
154
155 // NOTE: We should probably consider merging all of these sumAll/minAll things at some point.
156
157 // Test0: Should we do node repartitioning?
158 if (currentLevel.GetLevelID() == nodeRepartLevel && map->getComm()->getSize() > 1) {
159 RCP<const Teuchos::Comm<int> > NodeComm = Get<RCP<const Teuchos::Comm<int> > >(currentLevel, "Node Comm");
160 TEUCHOS_TEST_FOR_EXCEPTION(NodeComm.is_null(), Exceptions::RuntimeError, "MueLu::RepartitionHeuristicFactory::Build(): NodeComm is null.");
161
162 // If we only have one node, then we don't want to pop down to one rank
163 if (NodeComm()->getSize() != map->getComm()->getSize()) {
164 GetOStream(Statistics1) << "Repartitioning? YES: \n Within node only" << std::endl;
165 int nodeRank = NodeComm->getRank();
166
167 // Do a reduction to get the total number of nodes
168 int isZero = (nodeRank == 0);
169 int numNodes = 0;
170 Teuchos::reduceAll(*map->getComm(), Teuchos::REDUCE_SUM, isZero, Teuchos::outArg(numNodes));
171 Set(currentLevel, "number of partitions", numNodes);
172 return;
173 }
174 }
175
176 // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning
177 if (currentLevel.GetLevelID() < startLevel) {
178 GetOStream(Statistics1) << "Repartitioning? NO:"
179 << "\n current level = " << Teuchos::toString(currentLevel.GetLevelID()) << ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl;
180
181 // a negative number of processors means: no repartitioning
182 Set(currentLevel, "number of partitions", -1);
183
184 return;
185 }
186
187 RCP<const Teuchos::Comm<int> > origComm = map->getComm();
188 RCP<const Teuchos::Comm<int> > comm = origComm;
189
190 // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A
191 // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created)
192 // TODO: further improvements could be achieved when we use subcommunicator for the active set. Then we only need to check its size
193
194 // TODO: The block transfer factories do not check correctly whether or not repartitioning actually took place.
195 {
196 if (comm->getSize() == 1 && Teuchos::rcp_dynamic_cast<const RAPFactory>(Afact) != Teuchos::null) {
197 GetOStream(Statistics1) << "Repartitioning? NO:"
198 << "\n comm size = 1" << std::endl;
199
200 Set(currentLevel, "number of partitions", -1);
201 return;
202 }
203
204 int numActiveProcesses = 0;
205 MueLu_sumAll(comm, Teuchos::as<int>((map->getLocalNumElements() > 0) ? 1 : 0), numActiveProcesses);
206
207 if (numActiveProcesses == 1) {
208 GetOStream(Statistics1) << "Repartitioning? NO:"
209 << "\n # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl;
210
211 Set(currentLevel, "number of partitions", 1);
212 return;
213 }
214 }
215
216 // Test 2.5: Repartition down to one rank if we're below "repatition: put on single proc"
217 // This is (mostly) an ML-compatibility thing
218 if (putOnSingleProc && map->getGlobalNumElements() < (Xpetra::global_size_t)putOnSingleProc) {
219 GetOStream(Statistics1) << "Repartitioning? YES:"
220 << "\n # rows is below the single-proc threshold = " << putOnSingleProc << std::endl;
221
222 Set(currentLevel, "number of partitions", 1);
223 return;
224 }
225
226 bool test3 = false, test4 = false;
227 std::string msg3, msg4;
228
229 // Test3: check whether number of rows on any processor satisfies the minimum number of rows requirement
230 // NOTE: Test2 ensures that repartitionning is not done when there is only one processor (it may or may not satisfy Test3)
231 if (minRowsPerProcess > 0) {
232 LO numMyRows = Teuchos::as<LO>(map->getLocalNumElements()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max();
233 LO haveFewRows = (numMyRows < minRowsPerProcess ? 1 : 0), numWithFewRows = 0;
234 MueLu_sumAll(comm, haveFewRows, numWithFewRows);
235 MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows);
236
237 // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some
238 // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements.
239 // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement.
240 if (numWithFewRows > 0)
241 test3 = true;
242
243 msg3 = "\n min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcess);
244 }
245
246 // Test4: check whether the balance in the number of nonzeros per processor is greater than threshold
247 if (!test3) {
248 if (useMap)
249 msg4 = "";
250 else {
251 GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getLocalNumEntries());
252 MueLu_maxAll(comm, numMyNnz, maxNnz);
253 MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors
254 double imbalance = Teuchos::as<double>(maxNnz) / minNnz;
255
256 if (imbalance > nonzeroImbalance)
257 test4 = true;
258
259 msg4 = "\n nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance);
260 }
261 }
262
263 if (!test3 && !test4) {
264 GetOStream(Statistics1) << "Repartitioning? NO:" << msg3 + msg4 << std::endl;
265
266 // A negative number of partitions means: no repartitioning
267 Set(currentLevel, "number of partitions", -1);
268 return;
269 }
270
271 GetOStream(Statistics1) << "Repartitioning? YES:" << msg3 + msg4 << std::endl;
272
273 // ======================================================================================================
274 // Calculate number of partitions
275 // ======================================================================================================
276 // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML)
277 // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold.
278
279 // The number of partitions is calculated by the RepartitionFactory and stored in "number of partitions" variable on
280 // the current level. If this variable is already set (e.g., by another instance of RepartitionFactory) then this number
281 // is used. The "number of partitions" variable serves as basic communication between the RepartitionFactory (which
282 // requests a certain number of partitions) and the *Interface classes which call the underlying partitioning algorithms
283 // and produce the "Partition" array with the requested number of partitions.
284 const auto globalNumRows = Teuchos::as<GO>(map->getGlobalNumElements());
285 int numPartitions = 1;
286 if (globalNumRows >= targetRowsPerProcess) {
287 // Make sure that each CPU thread has approximately targetRowsPerProcess
288 numPartitions = std::max(Teuchos::as<int>(globalNumRows / targetRowsPerProcess), 1);
289 }
290 numPartitions = std::min(numPartitions, comm->getSize());
291
292 Set(currentLevel, "number of partitions", numPartitions);
293
294 GetOStream(Statistics1) << "Number of partitions to use = " << numPartitions << std::endl;
295} // Build
296} // namespace MueLu
297
298#endif // ifdef HAVE_MPI
299#endif /* PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_ */
#define SET_VALID_ENTRY(name)
#define MueLu_maxAll(rcpComm, in, out)
#define MueLu_sumAll(rcpComm, in, out)
#define MueLu_minAll(rcpComm, in, out)
Exception throws to report errors in the internal logical of the program.
Timer to be used in factories. Similar to Monitor but with additional timers.
Class that holds all level-specific information.
int GetLevelID() const
Return level number.
void DeclareInput(Level &currentLevel) const
Determines the data that RepartitionHeuristicFactory needs, and the factories that generate that data...
virtual ~RepartitionHeuristicFactory()
Destructor.
void Build(Level &currentLevel) const
Build an object with this factory.
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
Namespace for MueLu classes and methods.
@ Warnings
Print all warning messages.
@ Statistics1
Print more statistics.