Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_DistributorPlan.cpp
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
11
#include "Tpetra_Details_DistributorPlan.hpp"

#include "Teuchos_StandardParameterEntryValidators.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_Details_ProfilingRegion.hpp"
#include "Tpetra_Util.hpp"

#include <numeric>
17
18namespace Tpetra {
19namespace Details {
20
21std::string
23 if (sendType == DISTRIBUTOR_ISEND) {
24 return "Isend";
25 } else if (sendType == DISTRIBUTOR_SEND) {
26 return "Send";
27 } else if (sendType == DISTRIBUTOR_ALLTOALL) {
28 return "Alltoall";
29 }
30#if defined(HAVE_TPETRA_MPI)
31 else if (sendType == DISTRIBUTOR_IALLTOFEWV) {
32 return "Ialltofewv";
33 }
34#endif
35#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
37 return "MpiAdvanceAlltoall";
39 return "MpiAdvanceNbralltoallv";
40 }
41#endif
42 else {
43 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
44 "Invalid "
45 "EDistributorSendType enum value "
46 << sendType << ".");
47 }
48}
49
51DistributorSendTypeStringToEnum(const std::string_view s) {
52 if (s == "Isend") return DISTRIBUTOR_ISEND;
53 if (s == "Send") return DISTRIBUTOR_SEND;
54 if (s == "Alltoall") return DISTRIBUTOR_ALLTOALL;
55#if defined(HAVE_TPETRA_MPI)
56 if (s == "Ialltofewv") return DISTRIBUTOR_IALLTOFEWV;
57#endif
58#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
59 if (s == "MpiAdvanceAlltoall") return DISTRIBUTOR_MPIADVANCE_ALLTOALL;
60 if (s == "MpiAdvanceNbralltoallv") return DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV;
61#endif
62 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid string to convert to EDistributorSendType enum value: " << s);
63}
64
66const std::string& validSendTypeOrThrow(const std::string& s) {
67 const auto valids = distributorSendTypes();
68 if (std::find(valids.begin(), valids.end(), s) == valids.end()) {
69 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid string for EDistributorSendType enum value: " << s);
70 }
71 return s;
72}
73
74std::string
76 switch (how) {
77 case Details::DISTRIBUTOR_NOT_INITIALIZED:
78 return "Not initialized yet";
79 case Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS:
80 return "By createFromSends";
81 case Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS:
82 return "By createFromRecvs";
83 case Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS:
84 return "By createFromSendsAndRecvs";
85 case Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE:
86 return "By createReverseDistributor";
87 case Details::DISTRIBUTOR_INITIALIZED_BY_COPY:
88 return "By copy constructor";
89 default:
90 return "INVALID";
91 }
92}
93
94DistributorPlan::DistributorPlan(Teuchos::RCP<const Teuchos::Comm<int>> comm)
95 : comm_(comm)
96 ,
98 mpixComm_(Teuchos::null)
99 ,
100#endif
101 howInitialized_(DISTRIBUTOR_NOT_INITIALIZED)
102 , reversePlan_(Teuchos::null)
103 , sendType_(DistributorSendTypeStringToEnum(Behavior::defaultSendType()))
104 , sendMessageToSelf_(false)
105 , numSendsToOtherProcs_(0)
106 , maxSendLength_(0)
107 , numReceives_(0)
108 , totalReceiveLength_(0) {
109}
110
111DistributorPlan::DistributorPlan(const DistributorPlan& otherPlan)
112 : comm_(otherPlan.comm_)
113 ,
114#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
115 mpixComm_(otherPlan.mpixComm_)
116 ,
117#endif
118 howInitialized_(DISTRIBUTOR_INITIALIZED_BY_COPY)
119 , reversePlan_(otherPlan.reversePlan_)
120 , sendType_(otherPlan.sendType_)
121 , sendMessageToSelf_(otherPlan.sendMessageToSelf_)
122 , numSendsToOtherProcs_(otherPlan.numSendsToOtherProcs_)
123 , procIdsToSendTo_(otherPlan.procIdsToSendTo_)
124 , startsTo_(otherPlan.startsTo_)
125 , lengthsTo_(otherPlan.lengthsTo_)
126 , maxSendLength_(otherPlan.maxSendLength_)
127 , indicesTo_(otherPlan.indicesTo_)
128 , numReceives_(otherPlan.numReceives_)
129 , totalReceiveLength_(otherPlan.totalReceiveLength_)
130 , lengthsFrom_(otherPlan.lengthsFrom_)
131 , procsFrom_(otherPlan.procsFrom_)
132 , startsFrom_(otherPlan.startsFrom_)
133 , indicesFrom_(otherPlan.indicesFrom_)
134#if defined(HAVE_TPETRACORE_MPI)
135 , roots_(otherPlan.roots_)
136#endif
137{
138}
139
140size_t DistributorPlan::createFromSends(const Teuchos::ArrayView<const int>& exportProcIDs) {
141 using std::endl;
142 using Teuchos::outArg;
143 using Teuchos::REDUCE_MAX;
144 using Teuchos::reduceAll;
145 const char rawPrefix[] = "Tpetra::DistributorPlan::createFromSends";
146 Tpetra::Details::ProfilingRegion pr("Tpetra::DistributorPlan::createFromSends");
147
148 const size_t numExports = exportProcIDs.size();
149 const int myProcID = comm_->getRank();
150 const int numProcs = comm_->getSize();
151 const bool debug = Details::Behavior::debug("Distributor");
152
153 // exportProcIDs tells us the communication pattern for this
154 // distributor. It dictates the way that the export data will be
155 // interpreted in doPosts(). We want to perform at most one
156 // send per process in doPosts; this is for two reasons:
157 // * minimize latency / overhead in the comm routines (nice)
158 // * match the number of receives and sends between processes
159 // (necessary)
160 //
161 // Teuchos::Comm requires that the data for a send are contiguous
162 // in a send buffer. Therefore, if the data in the send buffer
163 // for doPosts() are not contiguous, they will need to be copied
164 // into a contiguous buffer. The user has specified this
165 // noncontiguous pattern and we can't do anything about it.
166 // However, if they do not provide an efficient pattern, we will
167 // warn them if one of the following compile-time options has been
168 // set:
169 // * HAVE_TPETRA_PRINT_EFFICIENCY_WARNINGS
170 //
171 // If the data are contiguous, then we can post the sends in situ
172 // (i.e., without needing to copy them into a send buffer).
173 //
174 // Determine contiguity. There are a number of ways to do this:
175 // * If the export IDs are sorted, then all exports to a
176 // particular proc must be contiguous. This is what Epetra does.
177 // * If the export ID of the current export already has been
178 // listed, then the previous listing should correspond to the
179 // same export. This tests contiguity, but not sortedness.
180 //
181 // Both of these tests require O(n), where n is the number of
182 // exports. However, the latter will positively identify a greater
183 // portion of contiguous patterns. We use the latter method.
184 //
185 // Check to see if values are grouped by procs without gaps
186 // If so, indices_to -> 0.
187
188 if (debug) {
189 // Test whether any process in the communicator got an invalid
190 // process ID. If badID != -1 on this process, then it equals
191 // this process' rank. The max of all badID over all processes
192 // is the max rank which has an invalid process ID.
193 int badID = -1;
194 for (size_t i = 0; i < numExports; ++i) {
195 const int exportID = exportProcIDs[i];
196 if (exportID >= numProcs || exportID < 0) {
197 badID = myProcID;
198 break;
199 }
200 }
201 int gbl_badID;
202 reduceAll<int, int>(*comm_, REDUCE_MAX, badID, outArg(gbl_badID));
203 TEUCHOS_TEST_FOR_EXCEPTION(gbl_badID >= 0, std::runtime_error, rawPrefix << "Proc " << gbl_badID << ", perhaps among other processes, got a bad "
204 "send process ID.");
205 }
206
207 // Set up data structures for quick traversal of arrays.
208 // This contains the number of sends for each process ID.
209 //
210 // FIXME (mfh 20 Mar 2014) This is one of a few places in Tpetra
211 // that create an array of length the number of processes in the
212 // communicator (plus one). Given how this code uses this array,
213 // it should be straightforward to replace it with a hash table or
214 // some other more space-efficient data structure. In practice,
215 // most of the entries of starts should be zero for a sufficiently
216 // large process count, unless the communication pattern is dense.
217 // Note that it's important to be able to iterate through keys (i
218 // for which starts[i] is nonzero) in increasing order.
219 Teuchos::Array<size_t> starts(numProcs + 1, 0);
220
221 // numActive is the number of sends that are not Null
222 size_t numActive = 0;
223 int needSendBuff = 0; // Boolean
224
225 for (size_t i = 0; i < numExports; ++i) {
226 const int exportID = exportProcIDs[i];
227 if (exportID >= 0) {
228 // exportID is a valid process ID. Increment the number of
229 // messages this process will send to that process.
230 ++starts[exportID];
231
232 // If we're sending more than one message to process exportID,
233 // then it is possible that the data are not contiguous.
234 // Check by seeing if the previous process ID in the list
235 // (exportProcIDs[i-1]) is the same. It's safe to use i-1,
236 // because if starts[exportID] > 1, then i must be > 1 (since
237 // the starts array was filled with zeros initially).
238
239 // null entries break continuity.
240 // e.g., [ 0, 0, 0, 1, -99, 1, 2, 2, 2] is not contiguous
241 if (needSendBuff == 0 && starts[exportID] > 1 &&
242 exportID != exportProcIDs[i - 1]) {
243 needSendBuff = 1;
244 }
245 ++numActive;
246 }
247 }
248
249#if defined(HAVE_TPETRA_PRINT_EFFICIENCY_WARNINGS)
250 {
251 int global_needSendBuff;
252 reduceAll<int, int>(*comm_, REDUCE_MAX, needSendBuff,
253 outArg(global_needSendBuff));
255 global_needSendBuff != 0,
256 "::createFromSends: Grouping export IDs together by process rank often "
257 "improves performance.");
258 }
259#endif
260
261 // Determine from the caller's data whether or not the current
262 // process should send (a) message(s) to itself.
263 if (starts[myProcID] != 0) {
264 sendMessageToSelf_ = true;
265 } else {
266 sendMessageToSelf_ = false;
267 }
268
269 if (!needSendBuff) {
270 // grouped by proc, no send buffer or indicesTo_ needed
271 numSendsToOtherProcs_ = 0;
272 // Count total number of sends, i.e., total number of procs to
273 // which we are sending. This includes myself, if applicable.
274 for (int i = 0; i < numProcs; ++i) {
275 if (starts[i]) {
276 ++numSendsToOtherProcs_;
277 }
278 }
279
280 // Not only do we not need these, but we must clear them, as
281 // empty status of indicesTo is a flag used later.
282 indicesTo_.resize(0);
283 // Size these to numSendsToOtherProcs_; note, at the moment, numSendsToOtherProcs_
284 // includes self sends. Set their values to zeros.
285 procIdsToSendTo_.assign(numSendsToOtherProcs_, 0);
286 startsTo_.assign(numSendsToOtherProcs_, 0);
287 lengthsTo_.assign(numSendsToOtherProcs_, 0);
288
289 // set startsTo to the offset for each send (i.e., each proc ID)
290 // set procsTo to the proc ID for each send
291 // in interpreting this code, remember that we are assuming contiguity
292 // that is why index skips through the ranks
293 {
294 size_t procIndex = 0;
295 for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
296 while (exportProcIDs[procIndex] < 0) {
297 ++procIndex; // skip all negative proc IDs
298 }
299 startsTo_[i] = procIndex;
300 int procID = exportProcIDs[procIndex];
301 procIdsToSendTo_[i] = procID;
302 procIndex += starts[procID];
303 }
304 }
305 // sort the startsTo and proc IDs together, in ascending order, according
306 // to proc IDs
307 if (numSendsToOtherProcs_ > 0) {
308 sort2(procIdsToSendTo_.begin(), procIdsToSendTo_.end(), startsTo_.begin());
309 }
310 // compute the maximum send length
311 maxSendLength_ = 0;
312 for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
313 int procID = procIdsToSendTo_[i];
314 lengthsTo_[i] = starts[procID];
315 if ((procID != myProcID) && (lengthsTo_[i] > maxSendLength_)) {
316 maxSendLength_ = lengthsTo_[i];
317 }
318 }
319 } else {
320 // not grouped by proc, need send buffer and indicesTo_
321
322 // starts[i] is the number of sends to proc i
323 // numActive equals number of sends total, \sum_i starts[i]
324
325 // this loop starts at starts[1], so explicitly check starts[0]
326 if (starts[0] == 0) {
327 numSendsToOtherProcs_ = 0;
328 } else {
329 numSendsToOtherProcs_ = 1;
330 }
331 for (Teuchos::Array<size_t>::iterator i = starts.begin() + 1,
332 im1 = starts.begin();
333 i != starts.end(); ++i) {
334 if (*i != 0) ++numSendsToOtherProcs_;
335 *i += *im1;
336 im1 = i;
337 }
338 // starts[i] now contains the number of exports to procs 0 through i
339
340 for (Teuchos::Array<size_t>::reverse_iterator ip1 = starts.rbegin(),
341 i = starts.rbegin() + 1;
342 i != starts.rend(); ++i) {
343 *ip1 = *i;
344 ip1 = i;
345 }
346 starts[0] = 0;
347 // starts[i] now contains the number of exports to procs 0 through
348 // i-1, i.e., all procs before proc i
349
350 indicesTo_.resize(numActive);
351
352 for (size_t i = 0; i < numExports; ++i) {
353 if (exportProcIDs[i] >= 0) {
354 // record the offset to the sendBuffer for this export
355 indicesTo_[starts[exportProcIDs[i]]] = i;
356 // now increment the offset for this proc
357 ++starts[exportProcIDs[i]];
358 }
359 }
360 // our send buffer will contain the export data for each of the procs
361 // we communicate with, in order by proc id
362 // sendBuffer = {proc_0_data, proc_1_data, ..., proc_np-1_data}
363 // indicesTo now maps each export to the location in our send buffer
364 // associated with the export
365 // data for export i located at sendBuffer[indicesTo[i]]
366 //
367 // starts[i] once again contains the number of exports to
368 // procs 0 through i
369 for (int proc = numProcs - 1; proc != 0; --proc) {
370 starts[proc] = starts[proc - 1];
371 }
372 starts.front() = 0;
373 starts[numProcs] = numActive;
374 //
375 // starts[proc] once again contains the number of exports to
376 // procs 0 through proc-1
377 // i.e., the start of my data in the sendBuffer
378
379 // this contains invalid data at procs we don't care about, that is okay
380 procIdsToSendTo_.resize(numSendsToOtherProcs_);
381 startsTo_.resize(numSendsToOtherProcs_);
382 lengthsTo_.resize(numSendsToOtherProcs_);
383
384 // for each group of sends/exports, record the destination proc,
385 // the length, and the offset for this send into the
386 // send buffer (startsTo_)
387 maxSendLength_ = 0;
388 size_t snd = 0;
389 for (int proc = 0; proc < numProcs; ++proc) {
390 if (starts[proc + 1] != starts[proc]) {
391 lengthsTo_[snd] = starts[proc + 1] - starts[proc];
392 startsTo_[snd] = starts[proc];
393 // record max length for all off-proc sends
394 if ((proc != myProcID) && (lengthsTo_[snd] > maxSendLength_)) {
395 maxSendLength_ = lengthsTo_[snd];
396 }
397 procIdsToSendTo_[snd] = proc;
398 ++snd;
399 }
400 }
401 }
402
403 if (sendMessageToSelf_) {
404 --numSendsToOtherProcs_;
405 }
406
407 // Invert map to see what msgs are received and what length
408 computeReceives();
409
410#if defined(HAVE_TPETRA_MPI)
411 maybeInitializeRoots();
412#endif
413
414 // createFromRecvs() calls createFromSends(), but will set
415 // howInitialized_ again after calling createFromSends().
416 howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS;
417
418 return totalReceiveLength_;
419}
420
421void DistributorPlan::createFromRecvs(const Teuchos::ArrayView<const int>& remoteProcIDs) {
422 *this = *getReversePlan();
423 howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
424}
425
426void DistributorPlan::createFromSendsAndRecvs(const Teuchos::ArrayView<const int>& exportProcIDs,
427 const Teuchos::ArrayView<const int>& remoteProcIDs) {
428 // note the exportProcIDs and remoteProcIDs _must_ be a list that has
429 // an entry for each GID. If the export/remoteProcIDs is taken from
430 // the getProcs{From|To} lists that are extracted from a previous distributor,
431 // it will generate a wrong answer, because those lists have a unique entry
432 // for each processor id. A version of this with lengthsTo and lengthsFrom
433 // should be made.
434
435 howInitialized_ = Tpetra::Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS;
436
437 int myProcID = comm_->getRank();
438 int numProcs = comm_->getSize();
439
440 const size_t numExportIDs = exportProcIDs.size();
441 Teuchos::Array<size_t> starts(numProcs + 1, 0);
442
443 size_t numActive = 0;
444 int needSendBuff = 0; // Boolean
445
446 for (size_t i = 0; i < numExportIDs; i++) {
447 if (needSendBuff == 0 && i && (exportProcIDs[i] < exportProcIDs[i - 1]))
448 needSendBuff = 1;
449 if (exportProcIDs[i] >= 0) {
450 ++starts[exportProcIDs[i]];
451 ++numActive;
452 }
453 }
454
455 sendMessageToSelf_ = (starts[myProcID] != 0) ? 1 : 0;
456
457 numSendsToOtherProcs_ = 0;
458
459 if (needSendBuff) // grouped by processor, no send buffer or indicesTo_ needed
460 {
461 if (starts[0] == 0) {
462 numSendsToOtherProcs_ = 0;
463 } else {
464 numSendsToOtherProcs_ = 1;
465 }
466 for (Teuchos::Array<size_t>::iterator i = starts.begin() + 1,
467 im1 = starts.begin();
468 i != starts.end(); ++i) {
469 if (*i != 0) ++numSendsToOtherProcs_;
470 *i += *im1;
471 im1 = i;
472 }
473 // starts[i] now contains the number of exports to procs 0 through i
474
475 for (Teuchos::Array<size_t>::reverse_iterator ip1 = starts.rbegin(),
476 i = starts.rbegin() + 1;
477 i != starts.rend(); ++i) {
478 *ip1 = *i;
479 ip1 = i;
480 }
481 starts[0] = 0;
482 // starts[i] now contains the number of exports to procs 0 through
483 // i-1, i.e., all procs before proc i
484
485 indicesTo_.resize(numActive);
486
487 for (size_t i = 0; i < numExportIDs; ++i) {
488 if (exportProcIDs[i] >= 0) {
489 // record the offset to the sendBuffer for this export
490 indicesTo_[starts[exportProcIDs[i]]] = i;
491 // now increment the offset for this proc
492 ++starts[exportProcIDs[i]];
493 }
494 }
495 for (int proc = numProcs - 1; proc != 0; --proc) {
496 starts[proc] = starts[proc - 1];
497 }
498 starts.front() = 0;
499 starts[numProcs] = numActive;
500 procIdsToSendTo_.resize(numSendsToOtherProcs_);
501 startsTo_.resize(numSendsToOtherProcs_);
502 lengthsTo_.resize(numSendsToOtherProcs_);
503 maxSendLength_ = 0;
504 size_t snd = 0;
505 for (int proc = 0; proc < numProcs; ++proc) {
506 if (starts[proc + 1] != starts[proc]) {
507 lengthsTo_[snd] = starts[proc + 1] - starts[proc];
508 startsTo_[snd] = starts[proc];
509 // record max length for all off-proc sends
510 if ((proc != myProcID) && (lengthsTo_[snd] > maxSendLength_)) {
511 maxSendLength_ = lengthsTo_[snd];
512 }
513 procIdsToSendTo_[snd] = proc;
514 ++snd;
515 }
516 }
517 } else {
518 // grouped by proc, no send buffer or indicesTo_ needed
519 numSendsToOtherProcs_ = 0;
520 // Count total number of sends, i.e., total number of procs to
521 // which we are sending. This includes myself, if applicable.
522 for (int i = 0; i < numProcs; ++i) {
523 if (starts[i]) {
524 ++numSendsToOtherProcs_;
525 }
526 }
527
528 // Not only do we not need these, but we must clear them, as
529 // empty status of indicesTo is a flag used later.
530 indicesTo_.resize(0);
531 // Size these to numSendsToOtherProcs_; note, at the moment, numSendsToOtherProcs_
532 // includes self sends. Set their values to zeros.
533 procIdsToSendTo_.assign(numSendsToOtherProcs_, 0);
534 startsTo_.assign(numSendsToOtherProcs_, 0);
535 lengthsTo_.assign(numSendsToOtherProcs_, 0);
536
537 // set startsTo to the offset for each send (i.e., each proc ID)
538 // set procsTo to the proc ID for each send
539 // in interpreting this code, remember that we are assuming contiguity
540 // that is why index skips through the ranks
541 {
542 size_t procIndex = 0;
543 for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
544 while (exportProcIDs[procIndex] < 0) {
545 ++procIndex; // skip all negative proc IDs
546 }
547 startsTo_[i] = procIndex;
548 int procID = exportProcIDs[procIndex];
549 procIdsToSendTo_[i] = procID;
550 procIndex += starts[procID];
551 }
552 }
553 // sort the startsTo and proc IDs together, in ascending order, according
554 // to proc IDs
555 if (numSendsToOtherProcs_ > 0) {
556 sort2(procIdsToSendTo_.begin(), procIdsToSendTo_.end(), startsTo_.begin());
557 }
558 // compute the maximum send length
559 maxSendLength_ = 0;
560 for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
561 int procID = procIdsToSendTo_[i];
562 lengthsTo_[i] = starts[procID];
563 if ((procID != myProcID) && (lengthsTo_[i] > maxSendLength_)) {
564 maxSendLength_ = lengthsTo_[i];
565 }
566 }
567 }
568
569 numSendsToOtherProcs_ -= sendMessageToSelf_;
570 std::vector<int> recv_list;
571 recv_list.reserve(numSendsToOtherProcs_); // reserve an initial guess for size needed
572
573 int last_pid = -2;
574 for (int i = 0; i < remoteProcIDs.size(); i++) {
575 if (remoteProcIDs[i] > last_pid) {
576 recv_list.push_back(remoteProcIDs[i]);
577 last_pid = remoteProcIDs[i];
578 } else if (remoteProcIDs[i] < last_pid)
579 throw std::runtime_error("Tpetra::Distributor:::createFromSendsAndRecvs expected RemotePIDs to be in sorted order");
580 }
581 numReceives_ = recv_list.size();
582 if (numReceives_) {
583 procsFrom_.assign(numReceives_, 0);
584 lengthsFrom_.assign(numReceives_, 0);
585 indicesFrom_.assign(numReceives_, 0);
586 startsFrom_.assign(numReceives_, 0);
587 }
588 for (size_t i = 0, j = 0; i < numReceives_; ++i) {
589 int jlast = j;
590 procsFrom_[i] = recv_list[i];
591 startsFrom_[i] = j;
592 for (; j < (size_t)remoteProcIDs.size() &&
593 remoteProcIDs[jlast] == remoteProcIDs[j];
594 j++) {
595 ;
596 }
597 lengthsFrom_[i] = j - jlast;
598 }
599 totalReceiveLength_ = remoteProcIDs.size();
600 indicesFrom_.clear();
601 numReceives_ -= sendMessageToSelf_;
602
603#if defined(HAVE_TPETRA_MPI)
604 maybeInitializeRoots();
605#endif
606}
607
608Teuchos::RCP<DistributorPlan> DistributorPlan::getReversePlan() const {
609 if (reversePlan_.is_null()) createReversePlan();
610 return reversePlan_;
611}
612
613void DistributorPlan::createReversePlan() const {
614 reversePlan_ = Teuchos::rcp(new DistributorPlan(comm_));
615 reversePlan_->howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE;
616 reversePlan_->sendType_ = sendType_;
617
618#if defined(HAVE_TPETRACORE_MPI)
619 // If the forward plan matches an all-to-few communication pattern,
620 // the reverse plan is few-to-all, so don't use a special all-to-few
621 // implementation for it
622 if (DISTRIBUTOR_IALLTOFEWV == sendType_) {
623 if (Behavior::verbose()) {
624 std::stringstream ss;
625 ss << __FILE__ << ":" << __LINE__ << " WARNING (Ialltofewv send type): using default for reversed Ialltofewv\n";
626 std::cerr << ss.str();
627 }
628
629 reversePlan_->sendType_ = DistributorSendTypeStringToEnum(Behavior::defaultSendType());
630 }
631#endif
632
633 // The total length of all the sends of this DistributorPlan. We
634 // calculate it because it's the total length of all the receives
635 // of the reverse DistributorPlan.
636 size_t totalSendLength =
637 std::accumulate(lengthsTo_.begin(), lengthsTo_.end(), 0);
638
639 // The maximum length of any of the receives of this DistributorPlan.
640 // We calculate it because it's the maximum length of any of the
641 // sends of the reverse DistributorPlan.
642 size_t maxReceiveLength = 0;
643 const int myProcID = comm_->getRank();
644 for (size_t i = 0; i < numReceives_; ++i) {
645 if (procsFrom_[i] != myProcID) {
646 // Don't count receives for messages sent by myself to myself.
647 if (lengthsFrom_[i] > maxReceiveLength) {
648 maxReceiveLength = lengthsFrom_[i];
649 }
650 }
651 }
652
653 reversePlan_->sendMessageToSelf_ = sendMessageToSelf_;
654 reversePlan_->numSendsToOtherProcs_ = numReceives_;
655 reversePlan_->procIdsToSendTo_ = procsFrom_;
656 reversePlan_->startsTo_ = startsFrom_;
657 reversePlan_->lengthsTo_ = lengthsFrom_;
658 reversePlan_->maxSendLength_ = maxReceiveLength;
659 reversePlan_->indicesTo_ = indicesFrom_;
660 reversePlan_->numReceives_ = numSendsToOtherProcs_;
661 reversePlan_->totalReceiveLength_ = totalSendLength;
662 reversePlan_->lengthsFrom_ = lengthsTo_;
663 reversePlan_->procsFrom_ = procIdsToSendTo_;
664 reversePlan_->startsFrom_ = startsTo_;
665 reversePlan_->indicesFrom_ = indicesTo_;
666
667#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
668 // is there a smarter way to do this
669 reversePlan_->initializeMpiAdvance();
670#endif
671
672#if defined(HAVE_TPETRA_MPI)
673 reversePlan_->maybeInitializeRoots();
674#endif
675}
676
677void DistributorPlan::computeReceives() {
678 using Teuchos::Array;
679 using Teuchos::ArrayRCP;
680 using Teuchos::as;
681 using Teuchos::CommRequest;
682 using Teuchos::CommStatus;
683 using Teuchos::ireceive;
684 using Teuchos::RCP;
685 using Teuchos::rcp;
686 using Teuchos::receive;
687 using Teuchos::reduce;
688 using Teuchos::REDUCE_SUM;
689 using Teuchos::scatter;
690 using Teuchos::send;
691 using Teuchos::waitAll;
692 Tpetra::Details::ProfilingRegion pr("Tpetra::DistributorPlan::computeReceives");
693
694 const int myRank = comm_->getRank();
695 const int numProcs = comm_->getSize();
696
697 const int mpiTag = DEFAULT_MPI_TAG;
698
699 // toProcsFromMe[i] == the number of messages sent by this process
700 // to process i. The data in numSendsToOtherProcs_, procIdsToSendTo_, and lengthsTo_
701 // concern the contiguous sends. Therefore, each process will be
702 // listed in procIdsToSendTo_ at most once, and so toProcsFromMe[i] will
703 // either be 0 or 1.
704 {
705 Array<int> toProcsFromMe(numProcs, 0);
706#ifdef HAVE_TPETRA_DEBUG
707 bool counting_error = false;
708#endif // HAVE_TPETRA_DEBUG
709 for (size_t i = 0; i < (numSendsToOtherProcs_ + (sendMessageToSelf_ ? 1 : 0)); ++i) {
710#ifdef HAVE_TPETRA_DEBUG
711 if (toProcsFromMe[procIdsToSendTo_[i]] != 0) {
712 counting_error = true;
713 }
714#endif // HAVE_TPETRA_DEBUG
715 toProcsFromMe[procIdsToSendTo_[i]] = 1;
716 }
717#ifdef HAVE_TPETRA_DEBUG
718 // Note that SHARED_TEST_FOR_EXCEPTION does a global reduction
719 SHARED_TEST_FOR_EXCEPTION(counting_error, std::logic_error,
720 "Tpetra::Distributor::computeReceives: There was an error on at least "
721 "one process in counting the number of messages send by that process to "
722 "the other processs. Please report this bug to the Tpetra developers.",
723 *comm_);
724#endif // HAVE_TPETRA_DEBUG
725
726 // Compute the number of receives that this process needs to
727 // post. The number of receives includes any self sends (i.e.,
728 // messages sent by this process to itself).
729 //
730 // (We will use numReceives_ this below to post exactly that
731 // number of receives, with MPI_ANY_SOURCE as the sending rank.
732 // This will tell us from which processes this process expects
733 // to receive, and how many packets of data we expect to receive
734 // from each process.)
735 //
736 // toProcsFromMe[i] is the number of messages sent by this
737 // process to process i. Compute the sum (elementwise) of all
738 // the toProcsFromMe arrays on all processes in the
739 // communicator. If the array x is that sum, then if this
740 // process has rank j, x[j] is the number of messages sent
741 // to process j, that is, the number of receives on process j
742 // (including any messages sent by process j to itself).
743 //
744 // Yes, this requires storing and operating on an array of
745 // length P, where P is the number of processes in the
746 // communicator. Epetra does this too. Avoiding this O(P)
747 // memory bottleneck would require some research.
748 //
749 // mfh 09 Jan 2012, 15 Jul 2015: There are three ways to
750 // implement this O(P) memory algorithm.
751 //
752 // 1. Use MPI_Reduce and MPI_Scatter: reduce on the root
753 // process (0) from toProcsFromMe, to numRecvsOnEachProc.
754 // Then, scatter the latter, so that each process p gets
755 // numRecvsOnEachProc[p].
756 //
757 // 2. Like #1, but use MPI_Reduce_scatter instead of
758 // MPI_Reduce and MPI_Scatter. MPI_Reduce_scatter might be
759 // optimized to reduce the number of messages, but
760 // MPI_Reduce_scatter is more general than we need (it
761 // allows the equivalent of MPI_Scatterv). See Bug 6336.
762 //
763 // 3. Do an all-reduce on toProcsFromMe, and let my process
764 // (with rank myRank) get numReceives_ from
765 // toProcsFromMe[myRank]. The HPCCG miniapp uses the
766 // all-reduce method.
767 //
768 // Approaches 1 and 3 have the same critical path length.
769 // However, #3 moves more data. This is because the final
770 // result is just one integer, but #3 moves a whole array of
771 // results to all the processes. This is why we use Approach 1
772 // here.
773 //
774 // mfh 12 Apr 2013: See discussion in createFromSends() about
775 // how we could use this communication to propagate an error
776 // flag for "free" in a release build.
777
778 const int root = 0; // rank of root process of the reduction
779 Array<int> numRecvsOnEachProc; // temp; only needed on root
780 if (myRank == root) {
781 numRecvsOnEachProc.resize(numProcs);
782 }
783 int numReceivesAsInt = 0; // output
784 reduce<int, int>(toProcsFromMe.getRawPtr(),
785 numRecvsOnEachProc.getRawPtr(),
786 numProcs, REDUCE_SUM, root, *comm_);
787 scatter<int, int>(numRecvsOnEachProc.getRawPtr(), 1,
788 &numReceivesAsInt, 1, root, *comm_);
789 numReceives_ = static_cast<size_t>(numReceivesAsInt);
790 }
791
792 // Now we know numReceives_, which is this process' number of
793 // receives. Allocate the lengthsFrom_ and procsFrom_ arrays
794 // with this number of entries.
795 lengthsFrom_.assign(numReceives_, 0);
796 procsFrom_.assign(numReceives_, 0);
797
798 //
799 // Ask (via nonblocking receive) each process from which we are
800 // receiving how many packets we should expect from it in the
801 // communication pattern.
802 //
803
804 // At this point, numReceives_ includes any self message that
805 // there may be. At the end of this routine, we'll subtract off
806 // the self message (if there is one) from numReceives_. In this
807 // routine, we don't need to receive a message from ourselves in
808 // order to figure out our lengthsFrom_ and source process ID; we
809 // can just ask ourselves directly. Thus, the actual number of
810 // nonblocking receives we post here does not include the self
811 // message.
812 const size_t actualNumReceives = numReceives_ - (sendMessageToSelf_ ? 1 : 0);
813
814 // Teuchos' wrapper for nonblocking receives requires receive
815 // buffers that it knows won't go away. This is why we use RCPs,
816 // one RCP per nonblocking receive request. They get allocated in
817 // the loop below.
818 Array<RCP<CommRequest<int>>> requests(actualNumReceives);
819 Array<ArrayRCP<size_t>> lengthsFromBuffers(actualNumReceives);
820 Array<RCP<CommStatus<int>>> statuses(actualNumReceives);
821
822 // Teuchos::Comm treats a negative process ID as MPI_ANY_SOURCE
823 // (receive data from any process).
824#ifdef HAVE_MPI
825 const int anySourceProc = MPI_ANY_SOURCE;
826#else
827 const int anySourceProc = -1;
828#endif
829
830 // Post the (nonblocking) receives.
831 for (size_t i = 0; i < actualNumReceives; ++i) {
832 // Once the receive completes, we can ask the corresponding
833 // CommStatus object (output by wait()) for the sending process'
834 // ID (which we'll assign to procsFrom_[i] -- don't forget to
835 // do that!).
836 lengthsFromBuffers[i].resize(1);
837 lengthsFromBuffers[i][0] = as<size_t>(0);
838 requests[i] = ireceive<int, size_t>(lengthsFromBuffers[i], anySourceProc,
839 mpiTag, *comm_);
840 }
841
842 // Post the sends: Tell each process to which we are sending how
843 // many packets it should expect from us in the communication
844 // pattern. We could use nonblocking sends here, as long as we do
845 // a waitAll() on all the sends and receives at once.
846 //
847 // We assume that numSendsToOtherProcs_ and sendMessageToSelf_ have already been
848 // set. The value of numSendsToOtherProcs_ (my process' number of sends) does
849 // not include any message that it might send to itself.
850 for (size_t i = 0; i < numSendsToOtherProcs_ + (sendMessageToSelf_ ? 1 : 0); ++i) {
851 if (procIdsToSendTo_[i] != myRank) {
852 // Send a message to procIdsToSendTo_[i], telling that process that
853 // this communication pattern will send that process
854 // lengthsTo_[i] blocks of packets.
855 const size_t* const lengthsTo_i = &lengthsTo_[i];
856 send<int, size_t>(lengthsTo_i, 1, as<int>(procIdsToSendTo_[i]), mpiTag, *comm_);
857 } else {
858 // We don't need a send in the self-message case. If this
859 // process will send a message to itself in the communication
860 // pattern, then the last element of lengthsFrom_ and
861 // procsFrom_ corresponds to the self-message. Of course
862 // this process knows how long the message is, and the process
863 // ID is its own process ID.
864 lengthsFrom_[numReceives_ - 1] = lengthsTo_[i];
865 procsFrom_[numReceives_ - 1] = myRank;
866 }
867 }
868
869 //
870 // Wait on all the receives. When they arrive, check the status
871 // output of wait() for the receiving process ID, unpack the
872 // request buffers into lengthsFrom_, and set procsFrom_ from the
873 // status.
874 //
875 waitAll(*comm_, requests(), statuses());
876 for (size_t i = 0; i < actualNumReceives; ++i) {
877 lengthsFrom_[i] = *lengthsFromBuffers[i];
878 procsFrom_[i] = statuses[i]->getSourceRank();
879 }
880
881 // Sort the procsFrom_ array, and apply the same permutation to
882 // lengthsFrom_. This ensures that procsFrom_[i] and
883 // lengthsFrom_[i] refers to the same thing.
884 sort2(procsFrom_.begin(), procsFrom_.end(), lengthsFrom_.begin());
885
886 // Compute indicesFrom_
887 totalReceiveLength_ =
888 std::accumulate(lengthsFrom_.begin(), lengthsFrom_.end(), 0);
889 indicesFrom_.clear();
890
891 startsFrom_.clear();
892 startsFrom_.reserve(numReceives_);
893 for (size_t i = 0, j = 0; i < numReceives_; ++i) {
894 startsFrom_.push_back(j);
895 j += lengthsFrom_[i];
896 }
897
898 if (sendMessageToSelf_) {
899 --numReceives_;
900 }
901}
902
903void DistributorPlan::setParameterList(const Teuchos::RCP<Teuchos::ParameterList>& plist) {
904 using std::endl;
905 using Teuchos::FancyOStream;
906 using Teuchos::getIntegralValue;
907 using Teuchos::ParameterList;
908 using Teuchos::parameterList;
909 using Teuchos::RCP;
910
911 if (!plist.is_null()) {
912 RCP<const ParameterList> validParams = getValidParameters();
913 plist->validateParametersAndSetDefaults(*validParams);
914
915 const Details::EDistributorSendType sendType =
916 getIntegralValue<Details::EDistributorSendType>(*plist, "Send type");
917
918 // Now that we've validated the input list, save the results.
919 sendType_ = sendType;
920
921#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
922 initializeMpiAdvance();
923#endif
924
925 // ParameterListAcceptor semantics require pointer identity of the
926 // sublist passed to setParameterList(), so we save the pointer.
927 this->setMyParamList(plist);
928
929#if defined(HAVE_TPETRA_MPI)
930 maybeInitializeRoots();
931#endif
932 }
933}
934
935Teuchos::Array<std::string> distributorSendTypes() {
936 Teuchos::Array<std::string> sendTypes;
937 sendTypes.push_back("Isend");
938 sendTypes.push_back("Send");
939 sendTypes.push_back("Alltoall");
940#if defined(HAVE_TPETRA_MPI)
941 sendTypes.push_back("Ialltofewv");
942#endif
943#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
944 sendTypes.push_back("MpiAdvanceAlltoall");
945 sendTypes.push_back("MpiAdvanceNbralltoallv");
946#endif
947 return sendTypes;
948}
949
950Teuchos::Array<EDistributorSendType> distributorSendTypeEnums() {
951 Teuchos::Array<EDistributorSendType> res;
952 res.push_back(DISTRIBUTOR_ISEND);
953 res.push_back(DISTRIBUTOR_SEND);
954 res.push_back(DISTRIBUTOR_ALLTOALL);
955#if defined(HAVE_TPETRA_MPI)
956 res.push_back(DISTRIBUTOR_IALLTOFEWV);
957#endif
958#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
961#endif
962 return res;
963}
964
965Teuchos::RCP<const Teuchos::ParameterList>
966DistributorPlan::getValidParameters() const {
967 using Teuchos::Array;
968 using Teuchos::ParameterList;
969 using Teuchos::parameterList;
970 using Teuchos::RCP;
971 using Teuchos::setStringToIntegralParameter;
972
973 Array<std::string> sendTypes = distributorSendTypes();
974 const Array<Details::EDistributorSendType> sendTypeEnums = distributorSendTypeEnums();
975
976 const std::string validatedSendType = validSendTypeOrThrow(Behavior::defaultSendType());
977
978 RCP<ParameterList> plist = parameterList("Tpetra::Distributor");
979
982 "When using MPI, the variant of send to use in "
983 "do[Reverse]Posts()",
984 sendTypes(), sendTypeEnums(), plist.getRawPtr());
985 plist->set("Timer Label", "", "Label for Time Monitor output");
986
987 return Teuchos::rcp_const_cast<const ParameterList>(plist);
988}
989
990#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
991
992// Used by Teuchos::RCP to clean up an owned MPIX_Comm*
993struct MpixCommDeallocator {
994 void free(MPIX_Comm** comm) const {
995 MPIX_Comm_free(comm);
996 }
997};
998
999void DistributorPlan::initializeMpiAdvance() {
1000 // assert the mpix communicator is null. if this is not the case we will figure out why
1001 TEUCHOS_ASSERT(mpixComm_.is_null());
1002
1003 // use the members to initialize the graph for neightborhood mode, or just the MPIX communicator for non-neighborhood mode
1004 Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm_);
1005 Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm = mpiComm->getRawMpiComm();
1006 int err = 0;
1007 if (sendType_ == DISTRIBUTOR_MPIADVANCE_ALLTOALL ||
1008 sendType_ == DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV) {
1009 MPIX_Comm** mpixComm = new (MPIX_Comm*);
1010 err = MPIX_Comm_init(mpixComm, (*rawComm)());
1011 mpixComm_ = Teuchos::RCP(mpixComm,
1012 MpixCommDeallocator(),
1013 true /*take ownership*/
1014 );
1015 }
1016
1017 TEUCHOS_ASSERT(err == 0);
1018}
1019#endif
1020
1021#if defined(HAVE_TPETRA_MPI)
1022// FIXME: probably need to rename this function since it might change the sendType
1023void DistributorPlan::maybeInitializeRoots() {
1024 // Only IALLTOFEWV needs to know the roots
1025 if (DISTRIBUTOR_IALLTOFEWV != sendType_) {
1026 roots_.clear();
1027 return;
1028 }
1029
1030 ProfilingRegion region_maybeInitializeRoots("Tpetra::DistributorPlan::maybeInitializeRoots");
1031
1032 // send my number of recvs to everyone
1033 const int numRecvs = (int)(getNumReceives() + (hasSelfMessage() ? 1 : 0));
1034 std::vector<int> sendbuf(comm_->getSize(), numRecvs);
1035 std::vector<int> recvbuf(comm_->getSize());
1036
1037 // FIXME: is there a more natural way to do this?
1038 // Maybe MPI_Allreduce is better, we just care if anyone is sending anything to each process
1039 // we just need to know all processes that receive anything (including a self message)
1040 Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm_);
1041 Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm = mpiComm->getRawMpiComm();
1042 MPI_Comm comm = (*rawComm)();
1043 MPI_Alltoall(sendbuf.data(), 1, MPI_INT, recvbuf.data(), 1, MPI_INT, comm);
1044
1045 roots_.clear();
1046 for (size_t root = 0; root < recvbuf.size(); ++root) {
1047 if (recvbuf[root] > 0) {
1048 roots_.push_back(root);
1049 }
1050 }
1051
1052 // In "slow-path" communication, the data is not blocked according to sending / receiving proc.
1053 // The root-detection algorithm expects data to be blocked, so disable.
1054 int slow = !getIndicesTo().is_null() ? 1 : 0;
1055 MPI_Allreduce(MPI_IN_PLACE, &slow, 1, MPI_INT, MPI_LOR, comm);
1056 if (slow) {
1058 {
1059 std::stringstream ss;
1060 ss << __FILE__ << ":" << __LINE__ << " " << comm_->getRank() << ": WARNING: Ialltoallv send mode set, at least one rank's data is not grouped by rank. Setting to \"Send\"" << std::endl;
1061 std::cerr << ss.str();
1062 }
1063 }
1064
1065 roots_.clear();
1066 sendType_ = DISTRIBUTOR_SEND;
1067 }
1068
1069 // if there aren't many roots, probably someone wanted to use a gather somewhere but then just reused the import/export thing for a scatter
1070 // which this won't work well for
1071 // just fall back to SEND if roots are more than sqrt of comm
1072 if (roots_.size() * roots_.size() >= size_t(comm_->getSize())) {
1074 std::stringstream ss;
1075 ss << __FILE__ << ":" << __LINE__ << " " << comm_->getRank() << ": WARNING (Ialltoallv send type): too many roots (" << roots_.size() << ") for " << comm_->getSize() << " ranks. Setting send-type to \"Send\"" << std::endl;
1076 std::cerr << ss.str();
1077 }
1078 roots_.clear();
1079 sendType_ = DISTRIBUTOR_SEND;
1080 }
1081}
1082#endif // HAVE_TPETRA_MPI
1083
1084DistributorPlan::SubViewLimits DistributorPlan::getImportViewLimits(size_t numPackets) const {
1085 const size_t actualNumReceives = getNumReceives() + (hasSelfMessage() ? 1 : 0);
1086
1087 IndexView importStarts(actualNumReceives);
1088 IndexView importLengths(actualNumReceives);
1089
1090 size_t offset = 0;
1091 for (size_t i = 0; i < actualNumReceives; ++i) {
1092 importStarts[i] = offset;
1093 offset += getLengthsFrom()[i] * numPackets;
1094 importLengths[i] = getLengthsFrom()[i] * numPackets;
1095 }
1096 return std::make_pair(importStarts, importLengths);
1097}
1098
1099DistributorPlan::SubViewLimits DistributorPlan::getImportViewLimits(const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID) const {
1100 const size_t actualNumReceives = getNumReceives() + (hasSelfMessage() ? 1 : 0);
1101
1102 IndexView importStarts(actualNumReceives);
1103 IndexView importLengths(actualNumReceives);
1104
1105 size_t offset = 0;
1106 size_t curLIDoffset = 0;
1107 for (size_t i = 0; i < actualNumReceives; ++i) {
1108 size_t totalPacketsFrom_i = 0;
1109 for (size_t j = 0; j < getLengthsFrom()[i]; ++j) {
1110 totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset + j];
1111 }
1112 curLIDoffset += getLengthsFrom()[i];
1113 importStarts[i] = offset;
1114 offset += totalPacketsFrom_i;
1115 importLengths[i] = totalPacketsFrom_i;
1116 }
1117 return std::make_pair(importStarts, importLengths);
1118}
1119
1120DistributorPlan::SubViewLimits DistributorPlan::getExportViewLimits(size_t numPackets) const {
1121 if (getIndicesTo().is_null()) {
1122 const size_t actualNumSends = getNumSends() + (hasSelfMessage() ? 1 : 0);
1123 IndexView exportStarts(actualNumSends);
1124 IndexView exportLengths(actualNumSends);
1125 for (size_t pp = 0; pp < actualNumSends; ++pp) {
1126 exportStarts[pp] = getStartsTo()[pp] * numPackets;
1127 exportLengths[pp] = getLengthsTo()[pp] * numPackets;
1128 }
1129 return std::make_pair(exportStarts, exportLengths);
1130 } else {
1131 const size_t numIndices = getIndicesTo().size();
1132 IndexView exportStarts(numIndices);
1133 IndexView exportLengths(numIndices);
1134 for (size_t j = 0; j < numIndices; ++j) {
1135 exportStarts[j] = getIndicesTo()[j] * numPackets;
1136 exportLengths[j] = numPackets;
1137 }
1138 return std::make_pair(exportStarts, exportLengths);
1139 }
1140}
1141
1142DistributorPlan::SubViewLimits DistributorPlan::getExportViewLimits(const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID) const {
1143 if (getIndicesTo().is_null()) {
1144 const size_t actualNumSends = getNumSends() + (hasSelfMessage() ? 1 : 0);
1145 IndexView exportStarts(actualNumSends);
1146 IndexView exportLengths(actualNumSends);
1147 size_t offset = 0;
1148 for (size_t pp = 0; pp < actualNumSends; ++pp) {
1149 size_t numPackets = 0;
1150 for (size_t j = getStartsTo()[pp];
1151 j < getStartsTo()[pp] + getLengthsTo()[pp]; ++j) {
1152 numPackets += numExportPacketsPerLID[j];
1153 }
1154 exportStarts[pp] = offset;
1155 offset += numPackets;
1156 exportLengths[pp] = numPackets;
1157 }
1158 return std::make_pair(exportStarts, exportLengths);
1159 } else {
1160 const size_t numIndices = getIndicesTo().size();
1161 IndexView exportStarts(numIndices);
1162 IndexView exportLengths(numIndices);
1163 size_t offset = 0;
1164 for (size_t j = 0; j < numIndices; ++j) {
1165 exportStarts[j] = offset;
1166 offset += numExportPacketsPerLID[j];
1167 exportLengths[j] = numExportPacketsPerLID[j];
1168 }
1169 return std::make_pair(exportStarts, exportLengths);
1170 }
1171}
1172
1173} // namespace Details
1174} // namespace Tpetra
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Stand-alone utility functions and macros.
#define TPETRA_EFFICIENCY_WARNING(throw_exception_test, msg)
Print or throw an efficiency warning.
#define SHARED_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg, comm)
Test for exception, with reduction over the given communicator.
Struct that holds views of the contents of a CrsMatrix.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
Implementation details of Tpetra.
std::string DistributorSendTypeEnumToString(EDistributorSendType sendType)
Convert an EDistributorSendType enum value to a string.
EDistributorSendType
The type of MPI send that Distributor should use.
Teuchos::Array< EDistributorSendType > distributorSendTypeEnums()
Valid enum values of distributor send types.
Teuchos::Array< std::string > distributorSendTypes()
Valid string values for Distributor's "Send type" parameter.
EDistributorSendType DistributorSendTypeStringToEnum(const std::string_view s)
Convert a string to an EDistributorSendType. Throw on error.
const std::string & validSendTypeOrThrow(const std::string &s)
Valid enum values of distributor send types.
EDistributorHowInitialized
Enum indicating how and whether a Distributor was initialized.
std::string DistributorHowInitializedEnumToString(EDistributorHowInitialized how)
Convert an EDistributorHowInitialized enum value to a string.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2, const bool stableSort=false)
Sort the first array, and apply the resulting permutation to the second array.